bytestrie.h revision 103e9ffba2cba345d0078eb8b8db33249f81840a
1b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/* 2b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho******************************************************************************* 3103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius* Copyright (C) 2010-2012, International Business Machines 4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* Corporation and others. All Rights Reserved. 5b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho******************************************************************************* 6b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* file name: bytestrie.h 7b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* encoding: US-ASCII 8b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* tab size: 8 (not used) 9b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* indentation:4 10b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* 11b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* created on: 2010sep25 12b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* created by: Markus W. Scherer 13b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho*/ 14b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 15b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#ifndef __BYTESTRIE_H__ 16b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define __BYTESTRIE_H__ 17b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 18b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/** 19b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * \file 20b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * \brief C++ API: Trie for mapping byte sequences to integer values. 21b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 22b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 23b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/utypes.h" 24b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/stringpiece.h" 25b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/uobject.h" 26b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "unicode/ustringtrie.h" 27b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 28b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoU_NAMESPACE_BEGIN 29b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 30b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass ByteSink; 31b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass BytesTrieBuilder; 32b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass CharString; 33b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass UVector32; 34b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 35b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho/** 36b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Light-weight, non-const reader class for a BytesTrie. 37b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Traverses a byte-serialized data structure with minimal state, 38b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * for mapping byte sequences to non-negative integer values. 39b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 40b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * This class owns the serialized trie data only if it was constructed by 41b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * the builder's build() method. 42b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * The public constructor and the copy constructor only alias the data (only copy the pointer). 43b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * There is no assignment operator. 44b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 45b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * This class is not intended for public subclassing. 46103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 47b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 48b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoclass U_COMMON_API BytesTrie : public UMemory { 49b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehopublic: 50b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 51b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Constructs a BytesTrie reader instance. 52b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 53b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder, 54b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * starting with the first byte of that sequence. 55b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * The BytesTrie object will not read more bytes than 56b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * the BytesTrieBuilder generated in the corresponding build() call. 57b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 58b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * The array is not copied/cloned and must not be modified while 59b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * the BytesTrie object is in use. 60b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 61b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param trieBytes The byte array that contains the serialized trie. 62103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 63b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 64b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrie(const void *trieBytes) 65b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : ownedArray_(NULL), bytes_(reinterpret_cast<const uint8_t *>(trieBytes)), 66b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_(bytes_), remainingMatchLength_(-1) {} 67b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 68b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 69b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Destructor. 70103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 71b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 72b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ~BytesTrie(); 73b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 74b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 75b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Copy constructor, copies the other trie reader object and its state, 76b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * but not the byte array which will be shared. (Shallow copy.) 77b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param other Another BytesTrie object. 78103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 79b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 80b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrie(const BytesTrie &other) 81b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : ownedArray_(NULL), bytes_(other.bytes_), 82b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} 83b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 84b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 85b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Resets this trie to its initial state. 86b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return *this 87103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 88b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 89b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrie &reset() { 90b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_=bytes_; 91b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho remainingMatchLength_=-1; 92b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return *this; 93b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 94b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 95b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 96b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * BytesTrie state object, for saving a trie's current state 97b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * and resetting the trie back to this state later. 98103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 99b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 100b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho class State : public UMemory { 101b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho public: 102b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 103b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Constructs an empty State. 104103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 105b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 106b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho State() { bytes=NULL; } 107b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho private: 108b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho friend class BytesTrie; 109b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 110b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const uint8_t *bytes; 111b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const uint8_t *pos; 112b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t remainingMatchLength; 113b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho }; 114b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 115b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 116b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Saves the state of this trie. 117b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param state The State object to hold the trie's state. 118b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return *this 119b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @see resetToState 120103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 121b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 122b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const BytesTrie &saveState(State &state) const { 123b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho state.bytes=bytes_; 124b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho state.pos=pos_; 125b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho state.remainingMatchLength=remainingMatchLength_; 126b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return *this; 127b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 128b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 129b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 130b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Resets this trie to the saved state. 131b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * If the state object contains no state, or the state of a different trie, 132b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * then this trie remains unchanged. 133b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param state The State object which holds a saved trie state. 134b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return *this 135b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @see saveState 136b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @see reset 137103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 138b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 139b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrie &resetToState(const State &state) { 140b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(bytes_==state.bytes && bytes_!=NULL) { 141b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_=state.pos; 142b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho remainingMatchLength_=state.remainingMatchLength; 143b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 144b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return *this; 145b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 146b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 147b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 148b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Determines whether the byte sequence so far matches, whether it has a value, 149b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * and whether another input byte can continue a matching byte sequence. 150b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The match/value Result. 151103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 152b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 153b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStringTrieResult current() const; 154b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 155b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 156b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Traverses the trie from the initial state for this input byte. 157b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Equivalent to reset().next(inByte). 158b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 159b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Values below -0x100 and above 0xff will never match. 160b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The match/value Result. 161103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 162b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 163b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho inline UStringTrieResult first(int32_t inByte) { 164b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho remainingMatchLength_=-1; 165b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(inByte<0) { 166b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho inByte+=0x100; 167b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 168b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return nextImpl(bytes_, inByte); 169b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 170b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 171b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 172b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Traverses the trie from the current state for this input byte. 173b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 174b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Values below -0x100 and above 0xff will never match. 175b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The match/value Result. 176103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 177b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 178b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStringTrieResult next(int32_t inByte); 179b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 180b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 181b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Traverses the trie from the current state for this byte sequence. 182b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Equivalent to 183b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * \code 184b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Result result=current(); 185b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * for(each c in s) 186b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH; 187b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * result=next(c); 188b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * return result; 189b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * \endcode 190b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param s A string or byte sequence. Can be NULL if length is 0. 191b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param length The length of the byte sequence. Can be -1 if NUL-terminated. 192b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The match/value Result. 193103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 194b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 195b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStringTrieResult next(const char *s, int32_t length); 196b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 197b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 198b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Returns a matching byte sequence's value if called immediately after 199b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE. 200b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * getValue() can be called multiple times. 201b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 202b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE! 203b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The value for the byte sequence so far. 204103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 205b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 206b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho inline int32_t getValue() const { 207b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const uint8_t *pos=pos_; 208b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t leadByte=*pos++; 209b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // U_ASSERT(leadByte>=kMinValueLead); 210b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return readValue(pos, leadByte>>1); 211b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 212b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 213b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 214b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Determines whether all byte sequences reachable from the current state 215b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * map to the same value. 216b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param uniqueValue Receives the unique value, if this function returns TRUE. 217b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * (output-only) 218b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return TRUE if all byte sequences reachable from the current state 219b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * map to the same value. 220103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 221b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 222b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho inline UBool hasUniqueValue(int32_t &uniqueValue) const { 223b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const uint8_t *pos=pos_; 224b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Skip the rest of a pending linear-match node. 225b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue); 226b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 227b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 228b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 229b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Finds each byte which continues the byte sequence from the current state. 230b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now. 231b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param out Each next byte is appended to this object. 232b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * (Only uses the out.Append(s, length) method.) 233b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return the number of bytes which continue the byte sequence from here 234103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 235b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 236b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t getNextBytes(ByteSink &out) const; 237b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 238b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 239b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 240103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 241b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 242b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho class U_COMMON_API Iterator : public UMemory { 243b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho public: 244b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 245b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Iterates from the root of a byte-serialized BytesTrie. 246b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param trieBytes The trie bytes. 247b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 248b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Otherwise, the iterator returns strings with this maximum length. 249b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param errorCode Standard ICU error code. Its input value must 250b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * pass the U_SUCCESS() test, or else the function returns 251b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * immediately. Check for U_FAILURE() on output or use with 252b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * function chaining. (See User Guide for details.) 253103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 254b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 255b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode); 256b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 257b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 258b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Iterates from the current state of the specified BytesTrie. 259b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param trie The trie whose state will be copied for iteration. 260b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 261b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Otherwise, the iterator returns strings with this maximum length. 262b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param errorCode Standard ICU error code. Its input value must 263b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * pass the U_SUCCESS() test, or else the function returns 264b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * immediately. Check for U_FAILURE() on output or use with 265b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * function chaining. (See User Guide for details.) 266103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 267b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 268b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); 269b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 270b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 271b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Destructor. 272103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 273b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 274b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ~Iterator(); 275b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 276b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 277b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Resets this iterator to its initial state. 278b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return *this 279103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 280b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 281b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho Iterator &reset(); 282b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 283b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 284b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return TRUE if there are more elements. 285103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 286b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 287b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool hasNext() const; 288b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 289b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 290b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Finds the next (byte sequence, value) pair if there is one. 291b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * 292b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * If the byte sequence is truncated to the maximum length and does not 293b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * have a real value, then the value is set to -1. 294b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * In this case, this "not a real value" is indistinguishable from 295b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * a real value of -1. 296b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @param errorCode Standard ICU error code. Its input value must 297b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * pass the U_SUCCESS() test, or else the function returns 298b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * immediately. Check for U_FAILURE() on output or use with 299b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * function chaining. (See User Guide for details.) 300b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return TRUE if there is another element. 301103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 302b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 303b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool next(UErrorCode &errorCode); 304b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 305b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 306b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The NUL-terminated byte sequence for the last successful next(). 307103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 308b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 309b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const StringPiece &getString() const { return sp_; } 310b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 311b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * @return The value for the last successful next(). 312103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius * @stable ICU 4.8 313b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 314b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t getValue() const { return value_; } 315b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 316b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho private: 317b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool truncateAndStop(); 318b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 319b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode); 320b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 321b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const uint8_t *bytes_; 322b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const uint8_t *pos_; 323b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const uint8_t *initialPos_; 324b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t remainingMatchLength_; 325b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t initialRemainingMatchLength_; 326b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 327b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho CharString *str_; 328b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho StringPiece sp_; 329b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t maxLength_; 330b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t value_; 331b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 332b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The stack stores pairs of integers for backtracking to another 333b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // outbound edge of a branch node. 334b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The first integer is an offset from bytes_. 335b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The second integer has the str_->length() from before the node in bits 15..0, 336b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 337b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 338b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // but the code looks more confusing that way.) 339b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UVector32 *stack_; 340b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho }; 341b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 342b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoprivate: 343b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho friend class BytesTrieBuilder; 344b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 345b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /** 346b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Constructs a BytesTrie reader instance. 347b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Unlike the public constructor which just aliases an array, 348b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * this constructor adopts the builder's array. 349b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * This constructor is only called by the builder. 350b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 351b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrie(void *adoptBytes, const void *trieBytes) 352b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : ownedArray_(reinterpret_cast<uint8_t *>(adoptBytes)), 353b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho bytes_(reinterpret_cast<const uint8_t *>(trieBytes)), 354b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_(bytes_), remainingMatchLength_(-1) {} 355b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 356b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // No assignment operator. 357b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho BytesTrie &operator=(const BytesTrie &other); 358b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 359b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho inline void stop() { 360b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos_=NULL; 361b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 362b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 363b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Reads a compact 32-bit integer. 364b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // pos is already after the leadByte, and the lead byte is already shifted right by 1. 365b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static int32_t readValue(const uint8_t *pos, int32_t leadByte); 366b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) { 367b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // U_ASSERT(leadByte>=kMinValueLead); 368b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(leadByte>=(kMinTwoByteValueLead<<1)) { 369b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(leadByte<(kMinThreeByteValueLead<<1)) { 370b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ++pos; 371b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else if(leadByte<(kFourByteValueLead<<1)) { 372b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos+=2; 373b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 374b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos+=3+((leadByte>>1)&1); 375b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 376b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 377b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return pos; 378b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 379b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static inline const uint8_t *skipValue(const uint8_t *pos) { 380b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t leadByte=*pos++; 381b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return skipValue(pos, leadByte); 382b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 383b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 384b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Reads a jump delta and jumps. 385b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const uint8_t *jumpByDelta(const uint8_t *pos); 386b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 387b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static inline const uint8_t *skipDelta(const uint8_t *pos) { 388b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t delta=*pos++; 389b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(delta>=kMinTwoByteDeltaLead) { 390b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(delta<kMinThreeByteDeltaLead) { 391b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho ++pos; 392b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else if(delta<kFourByteDeltaLead) { 393b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos+=2; 394b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 395b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pos+=3+(delta&1); 396b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 397b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 398b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return pos; 399b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 400b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 401b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static inline UStringTrieResult valueResult(int32_t node) { 402b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal)); 403b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 404b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 405b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Handles a branch node for both next(byte) and next(string). 406b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte); 407b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 408b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Requires remainingLength_<0. 409b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte); 410b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 411b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Helper functions for hasUniqueValue(). 412b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Recursively finds a unique value (or whether there is not a unique one) 413b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // from a branch. 414b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length, 415b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UBool haveUniqueValue, int32_t &uniqueValue); 416b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Recursively finds a unique value (or whether there is not a unique one) 417b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // starting from a position on a node lead byte. 418b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue); 419b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 420b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Helper functions for getNextBytes(). 421b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // getNextBytes() when pos is on a branch node. 422b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out); 423b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static void append(ByteSink &out, int c); 424b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 425b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // BytesTrie data structure 426b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 427b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The trie consists of a series of byte-serialized nodes for incremental 428b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // string/byte sequence matching. The root node is at the beginning of the trie data. 429b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 430b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Types of nodes are distinguished by their node lead byte ranges. 431b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // After each node, except a final-value node, another node follows to 432b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // encode match values or continue matching further bytes. 433b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 434b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Node types: 435b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - Value node: Stores a 32-bit integer in a compact, variable-length format. 436b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The value is for the string/byte sequence so far. 437b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // One node bit indicates whether the value is final or whether 438b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // matching continues with the next node. 439b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - Linear-match node: Matches a number of bytes. 440b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - Branch node: Branches to other nodes according to the current input byte. 441b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The node byte is the length of the branch (number of bytes to select from) 442b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // minus 1. It is followed by a sub-node: 443b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - If the length is at most kMaxBranchLinearSubNodeLength, then 444b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // there are length-1 (key, value) pairs and then one more comparison byte. 445b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If one of the key bytes matches, then the value is either a final value for 446b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the string/byte sequence so far, or a "jump" delta to the next node. 447b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If the last byte matches, then matching continues with the next node. 448b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // (Values have the same encoding as value nodes.) 449b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // - If the length is greater than kMaxBranchLinearSubNodeLength, then 450b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // there is one byte and one "jump" delta. 451b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If the input byte is less than the sub-node byte, then "jump" by delta to 452b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the next sub-node which will have a length of length/2. 453b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // (The delta has its own compact encoding.) 454b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Otherwise, skip the "jump" delta to the next sub-node 455b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // which will have a length of length-length/2. 456b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 457b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Node lead byte values. 458b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 459b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 460b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the length is one more than the next byte. 461b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 462b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // For a branch sub-node with at most this many entries, we drop down 463b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // to a linear search. 464b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxBranchLinearSubNodeLength=5; 465b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 466b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 467b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinLinearMatch=0x10; 468b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxLinearMatchLength=0x10; 469b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 470b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // 20..ff: Variable-length value node. 471b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 472b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Then shift-right by 1 bit. 473b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The remaining lead byte value indicates the number of following bytes (0..4) 474b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // and contains the value's top bits. 475b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 476b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // It is a final value if bit 0 is set. 477b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kValueIsFinal=1; 478b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 479b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 480b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10 481b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 482b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 483b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 484b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxTwoByteValue=0x1aff; 485b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 486b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 487b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kFourByteValueLead=0x7e; 488b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 489b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // A little more than Unicode code points. (0x11ffff) 490b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 491b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 492b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kFiveByteValueLead=0x7f; 493b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 494b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Compact delta integers. 495b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxOneByteDelta=0xbf; 496b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 497b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMinThreeByteDeltaLead=0xf0; 498b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kFourByteDeltaLead=0xfe; 499b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kFiveByteDeltaLead=0xff; 500b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 501b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 502b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 503b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 504b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho uint8_t *ownedArray_; 505b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 506b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Fixed value referencing the BytesTrie bytes. 507b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const uint8_t *bytes_; 508b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 509b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Iterator variables. 510b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 511b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Pointer to next trie byte to read. NULL if no more matches. 512b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho const uint8_t *pos_; 513b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 514b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t remainingMatchLength_; 515b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho}; 516b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 517b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoU_NAMESPACE_END 518b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 519b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#endif // __BYTESTRIE_H__ 520