16f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/* 26f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org******************************************************************************* 36f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Copyright (C) 2010-2012, International Business Machines 46f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* Corporation and others. All Rights Reserved. 56f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org******************************************************************************* 66f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* file name: bytestrie.h 76f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* encoding: US-ASCII 86f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* tab size: 8 (not used) 96f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* indentation:4 106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* 116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* created on: 2010sep25 126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org* created by: Markus W. Scherer 136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org*/ 146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#ifndef __BYTESTRIE_H__ 166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#define __BYTESTRIE_H__ 176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * \file 206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * \brief C++ API: Trie for mapping byte sequences to integer values. 216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/utypes.h" 246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/stringpiece.h" 256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/uobject.h" 266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#include "unicode/ustringtrie.h" 276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_BEGIN 296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass ByteSink; 316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass BytesTrieBuilder; 326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass CharString; 336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass UVector32; 346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org/** 366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Light-weight, non-const reader class for a BytesTrie. 376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Traverses a byte-serialized data structure with minimal state, 386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * for mapping byte sequences to non-negative integer values. 396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This class owns the serialized trie data only if it was constructed by 416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the builder's build() method. 426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * The public constructor and the copy constructor only alias the data (only copy the pointer). 436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * There is no assignment operator. 446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This class is not intended for public subclassing. 466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgclass U_COMMON_API BytesTrie : public UMemory { 496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgpublic: 506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Constructs a BytesTrie reader instance. 526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder, 546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * starting with the first byte of that sequence. 556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * The BytesTrie object will not read more bytes than 566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the BytesTrieBuilder generated in the corresponding build() call. 576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * The array is not copied/cloned and must not be modified while 596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * the BytesTrie object is in use. 606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param trieBytes The byte array that contains the serialized trie. 626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrie(const void *trieBytes) 656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)), 666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos_(bytes_), remainingMatchLength_(-1) {} 676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Destructor. 706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ~BytesTrie(); 736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Copy constructor, copies the other trie reader object and its state, 766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * but not the byte array which will be shared. (Shallow copy.) 776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param other Another BytesTrie object. 786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrie(const BytesTrie &other) 816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : ownedArray_(NULL), bytes_(other.bytes_), 826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} 836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Resets this trie to its initial state. 866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return *this 876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrie &reset() { 906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos_=bytes_; 916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org remainingMatchLength_=-1; 926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return *this; 936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * BytesTrie state object, for saving a trie's current state 976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and resetting the trie back to this state later. 986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org class State : public UMemory { 1016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org public: 1026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Constructs an empty State. 1046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 1056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org State() { bytes=NULL; } 1076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org private: 1086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org friend class BytesTrie; 1096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *bytes; 1116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *pos; 1126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t remainingMatchLength; 1136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org }; 1146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Saves the state of this trie. 1176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param state The State object to hold the trie's state. 1186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return *this 1196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @see resetToState 1206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 1216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const BytesTrie &saveState(State &state) const { 1236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org state.bytes=bytes_; 1246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org state.pos=pos_; 1256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org state.remainingMatchLength=remainingMatchLength_; 1266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return *this; 1276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Resets this trie to the saved state. 1316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * If the state object contains no state, or the state of a different trie, 1326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * then this trie remains unchanged. 1336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param state The State object which holds a saved trie state. 1346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return *this 1356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @see saveState 1366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @see reset 1376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 1386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrie &resetToState(const State &state) { 1406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(bytes_==state.bytes && bytes_!=NULL) { 1416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos_=state.pos; 1426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org remainingMatchLength_=state.remainingMatchLength; 1436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return *this; 1456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Determines whether the byte sequence so far matches, whether it has a value, 1496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * and whether another input byte can continue a matching byte sequence. 1506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return The match/value Result. 1516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 1526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStringTrieResult current() const; 1546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Traverses the trie from the initial state for this input byte. 1576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Equivalent to reset().next(inByte). 1586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 1596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Values below -0x100 and above 0xff will never match. 1606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return The match/value Result. 1616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 1626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org inline UStringTrieResult first(int32_t inByte) { 1646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org remainingMatchLength_=-1; 1656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(inByte<0) { 1666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org inByte+=0x100; 1676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return nextImpl(bytes_, inByte); 1696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 1706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Traverses the trie from the current state for this input byte. 1736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 1746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Values below -0x100 and above 0xff will never match. 1756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return The match/value Result. 1766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 1776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStringTrieResult next(int32_t inByte); 1796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Traverses the trie from the current state for this byte sequence. 1826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Equivalent to 1836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * \code 1846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Result result=current(); 1856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * for(each c in s) 1866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH; 1876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * result=next(c); 1886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * return result; 1896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * \endcode 1906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param s A string or byte sequence. Can be NULL if length is 0. 1916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param length The length of the byte sequence. Can be -1 if NUL-terminated. 1926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return The match/value Result. 1936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 1946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 1956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStringTrieResult next(const char *s, int32_t length); 1966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 1976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 1986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Returns a matching byte sequence's value if called immediately after 1996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE. 2006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * getValue() can be called multiple times. 2016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 2026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE! 2036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return The value for the byte sequence so far. 2046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 2056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org inline int32_t getValue() const { 2076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *pos=pos_; 2086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t leadByte=*pos++; 2096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // U_ASSERT(leadByte>=kMinValueLead); 2106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return readValue(pos, leadByte>>1); 2116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 2146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Determines whether all byte sequences reachable from the current state 2156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * map to the same value. 2166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param uniqueValue Receives the unique value, if this function returns TRUE. 2176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (output-only) 2186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return TRUE if all byte sequences reachable from the current state 2196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * map to the same value. 2206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 2216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org inline UBool hasUniqueValue(int32_t &uniqueValue) const { 2236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *pos=pos_; 2246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Skip the rest of a pending linear-match node. 2256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue); 2266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 2276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 2296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Finds each byte which continues the byte sequence from the current state. 2306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now. 2316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param out Each next byte is appended to this object. 2326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * (Only uses the out.Append(s, length) method.) 2336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return the number of bytes which continue the byte sequence from here 2346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 2356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t getNextBytes(ByteSink &out) const; 2376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 2396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 2406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 2416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org class U_COMMON_API Iterator : public UMemory { 2436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org public: 2446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 2456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Iterates from the root of a byte-serialized BytesTrie. 2466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param trieBytes The trie bytes. 2476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 2486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Otherwise, the iterator returns strings with this maximum length. 2496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param errorCode Standard ICU error code. Its input value must 2506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * pass the U_SUCCESS() test, or else the function returns 2516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * immediately. Check for U_FAILURE() on output or use with 2526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * function chaining. (See User Guide for details.) 2536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 2546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode); 2566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 2586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Iterates from the current state of the specified BytesTrie. 2596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param trie The trie whose state will be copied for iteration. 2606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 2616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Otherwise, the iterator returns strings with this maximum length. 2626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param errorCode Standard ICU error code. Its input value must 2636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * pass the U_SUCCESS() test, or else the function returns 2646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * immediately. Check for U_FAILURE() on output or use with 2656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * function chaining. (See User Guide for details.) 2666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 2676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); 2696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 2716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Destructor. 2726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 2736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ~Iterator(); 2756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 2776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Resets this iterator to its initial state. 2786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return *this 2796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 2806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org Iterator &reset(); 2826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 2846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return TRUE if there are more elements. 2856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 2866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 2876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool hasNext() const; 2886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 2896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 2906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Finds the next (byte sequence, value) pair if there is one. 2916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * 2926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * If the byte sequence is truncated to the maximum length and does not 2936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * have a real value, then the value is set to -1. 2946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * In this case, this "not a real value" is indistinguishable from 2956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * a real value of -1. 2966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @param errorCode Standard ICU error code. Its input value must 2976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * pass the U_SUCCESS() test, or else the function returns 2986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * immediately. Check for U_FAILURE() on output or use with 2996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * function chaining. (See User Guide for details.) 3006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return TRUE if there is another element. 3016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 3026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 3036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool next(UErrorCode &errorCode); 3046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 3066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return The NUL-terminated byte sequence for the last successful next(). 3076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 3086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 3096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const StringPiece &getString() const { return sp_; } 3106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 3116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @return The value for the last successful next(). 3126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * @stable ICU 4.8 3136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 3146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t getValue() const { return value_; } 3156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org private: 3176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool truncateAndStop(); 3186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode); 3206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *bytes_; 3226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *pos_; 3236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *initialPos_; 3246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t remainingMatchLength_; 3256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t initialRemainingMatchLength_; 3266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org CharString *str_; 3286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org StringPiece sp_; 3296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t maxLength_; 3306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t value_; 3316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The stack stores pairs of integers for backtracking to another 3336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // outbound edge of a branch node. 3346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The first integer is an offset from bytes_. 3356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The second integer has the str_->length() from before the node in bits 15..0, 3366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 3376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 3386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // but the code looks more confusing that way.) 3396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UVector32 *stack_; 3406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org }; 3416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgprivate: 3436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org friend class BytesTrieBuilder; 3446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org /** 3466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Constructs a BytesTrie reader instance. 3476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * Unlike the public constructor which just aliases an array, 3486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * this constructor adopts the builder's array. 3496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org * This constructor is only called by the builder. 3506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org */ 3516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrie(void *adoptBytes, const void *trieBytes) 3526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org : ownedArray_(static_cast<uint8_t *>(adoptBytes)), 3536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org bytes_(static_cast<const uint8_t *>(trieBytes)), 3546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos_(bytes_), remainingMatchLength_(-1) {} 3556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // No assignment operator. 3576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org BytesTrie &operator=(const BytesTrie &other); 3586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org inline void stop() { 3606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos_=NULL; 3616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Reads a compact 32-bit integer. 3646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // pos is already after the leadByte, and the lead byte is already shifted right by 1. 3656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static int32_t readValue(const uint8_t *pos, int32_t leadByte); 3666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) { 3676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // U_ASSERT(leadByte>=kMinValueLead); 3686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(leadByte>=(kMinTwoByteValueLead<<1)) { 3696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(leadByte<(kMinThreeByteValueLead<<1)) { 3706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++pos; 3716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(leadByte<(kFourByteValueLead<<1)) { 3726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=2; 3736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=3+((leadByte>>1)&1); 3756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; 3786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static inline const uint8_t *skipValue(const uint8_t *pos) { 3806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t leadByte=*pos++; 3816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return skipValue(pos, leadByte); 3826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Reads a jump delta and jumps. 3856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const uint8_t *jumpByDelta(const uint8_t *pos); 3866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 3876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static inline const uint8_t *skipDelta(const uint8_t *pos) { 3886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t delta=*pos++; 3896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(delta>=kMinTwoByteDeltaLead) { 3906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org if(delta<kMinThreeByteDeltaLead) { 3916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org ++pos; 3926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else if(delta<kFourByteDeltaLead) { 3936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=2; 3946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } else { 3956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org pos+=3+(delta&1); 3966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 3986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return pos; 3996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static inline UStringTrieResult valueResult(int32_t node) { 4026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal)); 4036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org } 4046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Handles a branch node for both next(byte) and next(string). 4066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte); 4076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Requires remainingLength_<0. 4096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte); 4106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Helper functions for hasUniqueValue(). 4126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Recursively finds a unique value (or whether there is not a unique one) 4136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // from a branch. 4146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length, 4156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org UBool haveUniqueValue, int32_t &uniqueValue); 4166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Recursively finds a unique value (or whether there is not a unique one) 4176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // starting from a position on a node lead byte. 4186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue); 4196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4206f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Helper functions for getNextBytes(). 4216f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // getNextBytes() when pos is on a branch node. 4226f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out); 4236f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static void append(ByteSink &out, int c); 4246f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4256f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // BytesTrie data structure 4266f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 4276f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The trie consists of a series of byte-serialized nodes for incremental 4286f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // string/byte sequence matching. The root node is at the beginning of the trie data. 4296f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 4306f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Types of nodes are distinguished by their node lead byte ranges. 4316f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // After each node, except a final-value node, another node follows to 4326f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // encode match values or continue matching further bytes. 4336f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 4346f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Node types: 4356f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // - Value node: Stores a 32-bit integer in a compact, variable-length format. 4366f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The value is for the string/byte sequence so far. 4376f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // One node bit indicates whether the value is final or whether 4386f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // matching continues with the next node. 4396f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // - Linear-match node: Matches a number of bytes. 4406f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // - Branch node: Branches to other nodes according to the current input byte. 4416f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The node byte is the length of the branch (number of bytes to select from) 4426f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // minus 1. It is followed by a sub-node: 4436f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // - If the length is at most kMaxBranchLinearSubNodeLength, then 4446f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // there are length-1 (key, value) pairs and then one more comparison byte. 4456f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If one of the key bytes matches, then the value is either a final value for 4466f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the string/byte sequence so far, or a "jump" delta to the next node. 4476f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If the last byte matches, then matching continues with the next node. 4486f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // (Values have the same encoding as value nodes.) 4496f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // - If the length is greater than kMaxBranchLinearSubNodeLength, then 4506f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // there is one byte and one "jump" delta. 4516f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If the input byte is less than the sub-node byte, then "jump" by delta to 4526f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the next sub-node which will have a length of length/2. 4536f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // (The delta has its own compact encoding.) 4546f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Otherwise, skip the "jump" delta to the next sub-node 4556f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // which will have a length of length-length/2. 4566f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4576f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Node lead byte values. 4586f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4596f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 4606f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // the length is one more than the next byte. 4616f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4626f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // For a branch sub-node with at most this many entries, we drop down 4636f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // to a linear search. 4646f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMaxBranchLinearSubNodeLength=5; 4656f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4666f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 4676f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMinLinearMatch=0x10; 4686f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMaxLinearMatchLength=0x10; 4696f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4706f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // 20..ff: Variable-length value node. 4716f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 4726f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Then shift-right by 1 bit. 4736f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // The remaining lead byte value indicates the number of following bytes (0..4) 4746f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // and contains the value's top bits. 4756f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 4766f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // It is a final value if bit 0 is set. 4776f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kValueIsFinal=1; 4786f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4796f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 4806f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10 4816f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 4826f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4836f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 4846f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMaxTwoByteValue=0x1aff; 4856f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4866f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 4876f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kFourByteValueLead=0x7e; 4886f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4896f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // A little more than Unicode code points. (0x11ffff) 4906f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 4916f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4926f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kFiveByteValueLead=0x7f; 4936f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 4946f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Compact delta integers. 4956f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMaxOneByteDelta=0xbf; 4966f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 4976f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMinThreeByteDeltaLead=0xf0; 4986f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kFourByteDeltaLead=0xfe; 4996f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kFiveByteDeltaLead=0xff; 5006f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5016f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 5026f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 5036f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5046f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org uint8_t *ownedArray_; 5056f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5066f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Fixed value referencing the BytesTrie bytes. 5076f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *bytes_; 5086f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5096f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Iterator variables. 5106f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5116f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Pointer to next trie byte to read. NULL if no more matches. 5126f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org const uint8_t *pos_; 5136f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 5146f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org int32_t remainingMatchLength_; 5156f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org}; 5166f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5176f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.orgU_NAMESPACE_END 5186f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org 5196f31ac30b9092fd02a8c97e5216cf53f3e4fae4jshin@chromium.org#endif // __BYTESTRIE_H__ 520