10596faeddefbf198de137d5e893708495ab1584cFredrik Roubert// © 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/* 4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho******************************************************************************* 583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius* Copyright (C) 2010-2012, International Business Machines 6b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* Corporation and others. All Rights Reserved. 7b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho******************************************************************************* 8b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* file name: ucharstrie.h 90596faeddefbf198de137d5e893708495ab1584cFredrik Roubert* encoding: UTF-8 10b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* tab size: 8 (not used) 11b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* indentation:4 12b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* 13b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* created on: 2010nov14 14b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* created by: Markus W. Scherer 15b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*/ 16b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 17b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#ifndef __UCHARSTRIE_H__ 18b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define __UCHARSTRIE_H__ 19b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 20b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/** 21b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * \file 22b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * \brief C++ API: Trie for mapping Unicode strings (or 16-bit-unit sequences) 23b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * to integer values. 24b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 25b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 26b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/utypes.h" 27b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/unistr.h" 28b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/uobject.h" 29b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/ustringtrie.h" 30b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 31b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoU_NAMESPACE_BEGIN 32b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 33b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass Appendable; 34b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass UCharsTrieBuilder; 35b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass UVector32; 36b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 37b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/** 38b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Light-weight, non-const reader class for a UCharsTrie. 390596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * Traverses a char16_t-serialized data structure with minimal state, 40b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * for mapping strings (16-bit-unit sequences) to non-negative integer values. 41b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 42b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * This class owns the serialized trie data only if it was constructed by 43b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * the builder's build() method. 44b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * The public constructor and the copy constructor only alias the data (only copy the pointer). 45b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * There is no assignment operator. 46b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 47b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * This class is not intended for public subclassing. 4883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 49b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 50b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass U_COMMON_API UCharsTrie : public UMemory { 51b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 52b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 53b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Constructs a UCharsTrie reader instance. 54b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 550596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * The trieUChars must contain a copy of a char16_t sequence from the UCharsTrieBuilder, 560596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * starting with the first char16_t of that sequence. 570596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * The UCharsTrie object will not read more char16_ts than 58b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * the UCharsTrieBuilder generated in the corresponding build() call. 59b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 60b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * The array is not copied/cloned and must not be modified while 61b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * the UCharsTrie object is in use. 62b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 630596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * @param trieUChars The char16_t array that contains the serialized trie. 6483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 65b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 660596faeddefbf198de137d5e893708495ab1584cFredrik Roubert UCharsTrie(ConstChar16Ptr trieUChars) 67b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : ownedArray_(NULL), uchars_(trieUChars), 68b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_(uchars_), remainingMatchLength_(-1) {} 69b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 70b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 71b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Destructor. 7283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 73b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 74b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ~UCharsTrie(); 75b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 76b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 77b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Copy constructor, copies the other trie reader object and its state, 780596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * but not the char16_t array which will be shared. (Shallow copy.) 79b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param other Another UCharsTrie object. 8083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 81b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 82b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCharsTrie(const UCharsTrie &other) 83b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : ownedArray_(NULL), uchars_(other.uchars_), 84b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} 85b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 86b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 87b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Resets this trie to its initial state. 88b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return *this 8983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 90b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 91b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCharsTrie &reset() { 92b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_=uchars_; 93b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho remainingMatchLength_=-1; 94b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return *this; 95b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 96b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 97b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 98b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * UCharsTrie state object, for saving a trie's current state 99b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * and resetting the trie back to this state later. 10083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 101b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 102b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho class State : public UMemory { 103b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho public: 104b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 105b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Constructs an empty State. 10683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 107b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 108b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho State() { uchars=NULL; } 109b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho private: 110b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho friend class UCharsTrie; 111b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 1120596faeddefbf198de137d5e893708495ab1584cFredrik Roubert const char16_t *uchars; 1130596faeddefbf198de137d5e893708495ab1584cFredrik Roubert const char16_t *pos; 114b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t remainingMatchLength; 115b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho }; 116b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 117b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 118b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Saves the state of this trie. 119b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param state The State object to hold the trie's state. 120b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return *this 121b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @see resetToState 12283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 123b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 124b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const UCharsTrie &saveState(State &state) const { 125b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho state.uchars=uchars_; 126b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho state.pos=pos_; 127b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho state.remainingMatchLength=remainingMatchLength_; 128b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return *this; 129b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 130b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 131b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 132b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Resets this trie to the saved state. 133b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * If the state object contains no state, or the state of a different trie, 134b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * then this trie remains unchanged. 135b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param state The State object which holds a saved trie state. 136b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return *this 137b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @see saveState 138b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @see reset 13983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 140b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 141b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCharsTrie &resetToState(const State &state) { 142b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(uchars_==state.uchars && uchars_!=NULL) { 143b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_=state.pos; 144b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho remainingMatchLength_=state.remainingMatchLength; 145b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 146b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return *this; 147b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 148b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 149b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 150b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Determines whether the string so far matches, whether it has a value, 1510596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * and whether another input char16_t can continue a matching string. 152b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The match/value Result. 15383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 154b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 155b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStringTrieResult current() const; 156b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 157b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 1580596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * Traverses the trie from the initial state for this input char16_t. 159b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Equivalent to reset().next(uchar). 160b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param uchar Input char value. Values below 0 and above 0xffff will never match. 161b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The match/value Result. 16283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 163b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 164b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho inline UStringTrieResult first(int32_t uchar) { 165b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho remainingMatchLength_=-1; 166b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return nextImpl(uchars_, uchar); 167b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 168b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 169b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 170b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Traverses the trie from the initial state for the 171b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * one or two UTF-16 code units for this input code point. 172b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Equivalent to reset().nextForCodePoint(cp). 173b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param cp A Unicode code point 0..0x10ffff. 174b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The match/value Result. 17583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 176b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 17783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UStringTrieResult firstForCodePoint(UChar32 cp); 178b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 179b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 1800596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * Traverses the trie from the current state for this input char16_t. 181b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param uchar Input char value. Values below 0 and above 0xffff will never match. 182b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The match/value Result. 18383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 184b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 185b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStringTrieResult next(int32_t uchar); 186b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 187b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 188b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Traverses the trie from the current state for the 189b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * one or two UTF-16 code units for this input code point. 190b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param cp A Unicode code point 0..0x10ffff. 191b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The match/value Result. 19283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 193b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 19483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UStringTrieResult nextForCodePoint(UChar32 cp); 195b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 196b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 197b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Traverses the trie from the current state for this string. 198b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Equivalent to 199b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * \code 200b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Result result=current(); 201b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * for(each c in s) 202b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH; 203b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * result=next(c); 204b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * return result; 205b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * \endcode 206b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param s A string. Can be NULL if length is 0. 207b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param length The length of the string. Can be -1 if NUL-terminated. 208b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The match/value Result. 20983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 210b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 2110596faeddefbf198de137d5e893708495ab1584cFredrik Roubert UStringTrieResult next(ConstChar16Ptr s, int32_t length); 212b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 213b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 214b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Returns a matching string's value if called immediately after 215b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE. 216b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * getValue() can be called multiple times. 217b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 218b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE! 219b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The value for the string so far. 22083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 221b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 222b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho inline int32_t getValue() const { 2230596faeddefbf198de137d5e893708495ab1584cFredrik Roubert const char16_t *pos=pos_; 224b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t leadUnit=*pos++; 225b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // U_ASSERT(leadUnit>=kMinValueLead); 226b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return leadUnit&kValueIsFinal ? 227b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit); 228b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 229b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 230b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 231b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Determines whether all strings reachable from the current state 232b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * map to the same value. 233b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param uniqueValue Receives the unique value, if this function returns TRUE. 234b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * (output-only) 235b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return TRUE if all strings reachable from the current state 236b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * map to the same value. 23783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 238b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 239b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho inline UBool hasUniqueValue(int32_t &uniqueValue) const { 2400596faeddefbf198de137d5e893708495ab1584cFredrik Roubert const char16_t *pos=pos_; 241b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Skip the rest of a pending linear-match node. 242b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue); 243b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 244b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 245b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 2460596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * Finds each char16_t which continues the string from the current state. 2470596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * That is, each char16_t c for which it would be next(c)!=USTRINGTRIE_NO_MATCH now. 2480596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * @param out Each next char16_t is appended to this object. 2490596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * @return the number of char16_ts which continue the string from here 25083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 251b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 252b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t getNextUChars(Appendable &out) const; 253b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 254b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 255b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Iterator for all of the (string, value) pairs in a UCharsTrie. 25683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 257b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 258b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho class U_COMMON_API Iterator : public UMemory { 259b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho public: 260b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 2610596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * Iterates from the root of a char16_t-serialized UCharsTrie. 2620596faeddefbf198de137d5e893708495ab1584cFredrik Roubert * @param trieUChars The trie char16_ts. 263b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param maxStringLength If 0, the iterator returns full strings. 264b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Otherwise, the iterator returns strings with this maximum length. 265b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param errorCode Standard ICU error code. Its input value must 266b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * pass the U_SUCCESS() test, or else the function returns 267b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * immediately. Check for U_FAILURE() on output or use with 268b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * function chaining. (See User Guide for details.) 26983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 270b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 2710596faeddefbf198de137d5e893708495ab1584cFredrik Roubert Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode); 272b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 273b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 274b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Iterates from the current state of the specified UCharsTrie. 275b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param trie The trie whose state will be copied for iteration. 276b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param maxStringLength If 0, the iterator returns full strings. 277b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Otherwise, the iterator returns strings with this maximum length. 278b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param errorCode Standard ICU error code. Its input value must 279b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * pass the U_SUCCESS() test, or else the function returns 280b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * immediately. Check for U_FAILURE() on output or use with 281b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * function chaining. (See User Guide for details.) 28283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 283b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 284b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); 285b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 286b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 287b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Destructor. 28883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 289b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 290b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ~Iterator(); 291b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 292b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 293b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Resets this iterator to its initial state. 294b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return *this 29583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 296b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 297b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho Iterator &reset(); 298b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 299b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 300b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return TRUE if there are more elements. 30183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 302b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 303b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool hasNext() const; 304b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 305b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 306b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Finds the next (string, value) pair if there is one. 307b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 308b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * If the string is truncated to the maximum length and does not 309b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * have a real value, then the value is set to -1. 310b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * In this case, this "not a real value" is indistinguishable from 311b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * a real value of -1. 312b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param errorCode Standard ICU error code. Its input value must 313b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * pass the U_SUCCESS() test, or else the function returns 314b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * immediately. Check for U_FAILURE() on output or use with 315b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * function chaining. (See User Guide for details.) 316b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return TRUE if there is another element. 31783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 318b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 319b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool next(UErrorCode &errorCode); 320b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 321b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 322b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The string for the last successful next(). 32383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 324b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 325b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const UnicodeString &getString() const { return str_; } 326b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 327b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The value for the last successful next(). 32883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * @stable ICU 4.8 329b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 330b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t getValue() const { return value_; } 331b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 332b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho private: 333b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool truncateAndStop() { 334b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_=NULL; 335b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho value_=-1; // no real value for str 336b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return TRUE; 337b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 338b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 3390596faeddefbf198de137d5e893708495ab1584cFredrik Roubert const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode); 340b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 3410596faeddefbf198de137d5e893708495ab1584cFredrik Roubert const char16_t *uchars_; 3420596faeddefbf198de137d5e893708495ab1584cFredrik Roubert const char16_t *pos_; 3430596faeddefbf198de137d5e893708495ab1584cFredrik Roubert const char16_t *initialPos_; 344b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t remainingMatchLength_; 345b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t initialRemainingMatchLength_; 346b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool skipValue_; // Skip intermediate value which was already delivered. 347b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 348b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UnicodeString str_; 349b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t maxLength_; 350b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t value_; 351b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 352b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The stack stores pairs of integers for backtracking to another 353b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // outbound edge of a branch node. 354b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The first integer is an offset from uchars_. 355b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The second integer has the str_.length() from before the node in bits 15..0, 356b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // and the remaining branch length in bits 31..16. 357b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit, 358b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // but the code looks more confusing that way.) 359b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UVector32 *stack_; 360b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho }; 361b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 362b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprivate: 363b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho friend class UCharsTrieBuilder; 364b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 365b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 366b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Constructs a UCharsTrie reader instance. 367b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Unlike the public constructor which just aliases an array, 368b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * this constructor adopts the builder's array. 369b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * This constructor is only called by the builder. 370b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 3710596faeddefbf198de137d5e893708495ab1584cFredrik Roubert UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars) 372b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : ownedArray_(adoptUChars), uchars_(trieUChars), 373b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_(uchars_), remainingMatchLength_(-1) {} 374b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 375b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // No assignment operator. 376b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UCharsTrie &operator=(const UCharsTrie &other); 377b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 378b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho inline void stop() { 379b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_=NULL; 380b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 381b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 382b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Reads a compact 32-bit integer. 383b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // pos is already after the leadUnit, and the lead unit has bit 15 reset. 3840596faeddefbf198de137d5e893708495ab1584cFredrik Roubert static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) { 385b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t value; 386b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(leadUnit<kMinTwoUnitValueLead) { 387b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho value=leadUnit; 388b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else if(leadUnit<kThreeUnitValueLead) { 389b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos; 390b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 391b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho value=(pos[0]<<16)|pos[1]; 392b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 393b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return value; 394b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 3950596faeddefbf198de137d5e893708495ab1584cFredrik Roubert static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) { 396b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(leadUnit>=kMinTwoUnitValueLead) { 397b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(leadUnit<kThreeUnitValueLead) { 398b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ++pos; 399b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 400b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos+=2; 401b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 402b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 403b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return pos; 404b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4050596faeddefbf198de137d5e893708495ab1584cFredrik Roubert static inline const char16_t *skipValue(const char16_t *pos) { 406b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t leadUnit=*pos++; 407b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return skipValue(pos, leadUnit&0x7fff); 408b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 409b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4100596faeddefbf198de137d5e893708495ab1584cFredrik Roubert static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) { 411b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 412b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t value; 413b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(leadUnit<kMinTwoUnitNodeValueLead) { 414b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho value=(leadUnit>>6)-1; 415b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else if(leadUnit<kThreeUnitNodeValueLead) { 416b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos; 417b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 418b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho value=(pos[0]<<16)|pos[1]; 419b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 420b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return value; 421b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 4220596faeddefbf198de137d5e893708495ab1584cFredrik Roubert static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) { 423b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 424b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(leadUnit>=kMinTwoUnitNodeValueLead) { 425b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(leadUnit<kThreeUnitNodeValueLead) { 426b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ++pos; 427b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 428b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos+=2; 429b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 430b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 431b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return pos; 432b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 433b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4340596faeddefbf198de137d5e893708495ab1584cFredrik Roubert static inline const char16_t *jumpByDelta(const char16_t *pos) { 435b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t delta=*pos++; 436b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(delta>=kMinTwoUnitDeltaLead) { 437b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(delta==kThreeUnitDeltaLead) { 438b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delta=(pos[0]<<16)|pos[1]; 439b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos+=2; 440b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 441b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++; 442b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 443b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 444b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return pos+delta; 445b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 446b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 4470596faeddefbf198de137d5e893708495ab1584cFredrik Roubert static const char16_t *skipDelta(const char16_t *pos) { 448b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t delta=*pos++; 449b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(delta>=kMinTwoUnitDeltaLead) { 450b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(delta==kThreeUnitDeltaLead) { 451b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos+=2; 452b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 453b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ++pos; 454b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 455b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 456b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return pos; 457b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 458b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 459b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static inline UStringTrieResult valueResult(int32_t node) { 460b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node>>15)); 461b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 462b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 463b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Handles a branch node for both next(uchar) and next(string). 4640596faeddefbf198de137d5e893708495ab1584cFredrik Roubert UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar); 465b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 466b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Requires remainingLength_<0. 4670596faeddefbf198de137d5e893708495ab1584cFredrik Roubert UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar); 468b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 469b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Helper functions for hasUniqueValue(). 470b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Recursively finds a unique value (or whether there is not a unique one) 471b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // from a branch. 4720596faeddefbf198de137d5e893708495ab1584cFredrik Roubert static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length, 473b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool haveUniqueValue, int32_t &uniqueValue); 474b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Recursively finds a unique value (or whether there is not a unique one) 475b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // starting from a position on a node lead unit. 4760596faeddefbf198de137d5e893708495ab1584cFredrik Roubert static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue); 477b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 478b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Helper functions for getNextUChars(). 479b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // getNextUChars() when pos is on a branch node. 4800596faeddefbf198de137d5e893708495ab1584cFredrik Roubert static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out); 481b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 482b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // UCharsTrie data structure 483b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 4840596faeddefbf198de137d5e893708495ab1584cFredrik Roubert // The trie consists of a series of char16_t-serialized nodes for incremental 4850596faeddefbf198de137d5e893708495ab1584cFredrik Roubert // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer) 486b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The root node is at the beginning of the trie data. 487b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 488b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Types of nodes are distinguished by their node lead unit ranges. 489b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // After each node, except a final-value node, another node follows to 490b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // encode match values or continue matching further units. 491b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 492b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Node types: 493b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - Final-value node: Stores a 32-bit integer in a compact, variable-length format. 4940596faeddefbf198de137d5e893708495ab1584cFredrik Roubert // The value is for the string/char16_t sequence so far. 495b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - Match node, optionally with an intermediate value in a different compact format. 4960596faeddefbf198de137d5e893708495ab1584cFredrik Roubert // The value, if present, is for the string/char16_t sequence so far. 497b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 498b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Aside from the value, which uses the node lead unit's high bits: 499b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 500b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - Linear-match node: Matches a number of units. 501b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - Branch node: Branches to other nodes according to the current input unit. 502b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The node unit is the length of the branch (number of units to select from) 503b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // minus 1. It is followed by a sub-node: 504b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - If the length is at most kMaxBranchLinearSubNodeLength, then 505b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // there are length-1 (key, value) pairs and then one more comparison unit. 506b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If one of the key units matches, then the value is either a final value for 507b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the string so far, or a "jump" delta to the next node. 508b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If the last unit matches, then matching continues with the next node. 509b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // (Values have the same encoding as final-value nodes.) 510b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - If the length is greater than kMaxBranchLinearSubNodeLength, then 511b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // there is one unit and one "jump" delta. 512b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If the input unit is less than the sub-node unit, then "jump" by delta to 513b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the next sub-node which will have a length of length/2. 514b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // (The delta has its own compact encoding.) 515b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Otherwise, skip the "jump" delta to the next sub-node 516b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // which will have a length of length-length/2. 517b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 518b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Match-node lead unit values, after masking off intermediate-value bits: 519b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 520b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise 521b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the length is one more than the next unit. 522b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 523b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // For a branch sub-node with at most this many entries, we drop down 524b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // to a linear search. 525b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxBranchLinearSubNodeLength=5; 526b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 527b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node. 528b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinLinearMatch=0x30; 529b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxLinearMatchLength=0x10; 530b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 531b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Match-node lead unit bits 14..6 for the optional intermediate value. 532b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If these bits are 0, then there is no intermediate value. 533b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Otherwise, see the *NodeValue* constants below. 534b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040 535b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f 536b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 537b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // A final-value node has bit 15 set. 538b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kValueIsFinal=0x8000; 539b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 540b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Compact value: After testing and masking off bit 15, use the following thresholds. 541b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxOneUnitValue=0x3fff; 542b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 543b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000 544b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kThreeUnitValueLead=0x7fff; 545b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 546b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff 547b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 548b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Compact intermediate-value integer, lead unit shared with a branch or linear-match node. 549b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxOneUnitNodeValue=0xff; 550b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040 551b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kThreeUnitNodeValueLead=0x7fc0; 552b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 553b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxTwoUnitNodeValue= 554b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff 555b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 556b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Compact delta integers. 557b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxOneUnitDelta=0xfbff; 558b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00 559b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kThreeUnitDeltaLead=0xffff; 560b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 561b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff 562b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 5630596faeddefbf198de137d5e893708495ab1584cFredrik Roubert char16_t *ownedArray_; 564b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 565b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Fixed value referencing the UCharsTrie words. 5660596faeddefbf198de137d5e893708495ab1584cFredrik Roubert const char16_t *uchars_; 567b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 568b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Iterator variables. 569b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 570b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Pointer to next trie unit to read. NULL if no more matches. 5710596faeddefbf198de137d5e893708495ab1584cFredrik Roubert const char16_t *pos_; 572b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 573b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t remainingMatchLength_; 574b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 575b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 576b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoU_NAMESPACE_END 577b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 578b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#endif // __UCHARSTRIE_H__ 579