1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// rbbisetb.h 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************** 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Copyright (c) 2001-2005, International Business Machines 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************** 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef RBBISETB_H 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define RBBISETB_H 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uobject.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbirb.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustruct UNewTrie; 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// RBBISetBuilder Derives the character categories used by the runtime RBBI engine 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// from the Unicode Sets appearing in the source RBBI rules, and 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// creates the TRIE table used to map from Unicode to the 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// character categories. 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// RangeDescriptor 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Each of the non-overlapping character ranges gets one of these descriptors. 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// All of them are strung together in a linked list, which is kept in order 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// (by character) 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass RangeDescriptor : public UMemory { 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 fStartChar; // Start of range, unicode 32 bit value. 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 fEndChar; // End of range, unicode 32 bit value. 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fNum; // runtime-mapped input value for this range. 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UVector *fIncludesSets; // vector of the the original 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Unicode sets that include this range. 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // (Contains ptrs to uset nodes) 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor(UErrorCode &status); 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~RangeDescriptor(); 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // where appearing in the second (higher) part. 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void setDictionaryFlag(); // Check whether this range appears as part of 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the Unicode set named "dictionary" 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Starting with the rules parse tree from the scanner, 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// - Enumerate the set of UnicodeSets that are referenced 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// by the RBBI rules. 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// - compute a derived set of non-overlapping UnicodeSets 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// that will correspond to columns in the state table for 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// the RBBI execution engine. 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// - construct the trie table that maps input characters 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// to set numbers in the non-overlapping set of sets. 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass RBBISetBuilder : public UMemory { 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBISetBuilder(RBBIRuleBuilder *rb); 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~RBBISetBuilder(); 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void build(); 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void addValToSets(UVector *sets, uint32_t val); 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void addValToSet (RBBINode *usetNode, uint32_t val); 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // runtime state machine, which are the same as 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // columns in the DFA state table 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void serializeTrie(uint8_t *where); // write out the serialized Trie. 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 getFirstChar(int32_t val) const; 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // character were encountered. 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifdef RBBI_DEBUG 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void printSets(); 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void printRanges(); 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void printRangeGroups(); 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#else 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru #define printSets() 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru #define printRanges() 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru #define printRangeGroups() 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void numberSets(); 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode *fStatus; 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UNewTrie *fTrie; // The mapping TRIE that is the end result of processing 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t fTrieSize; // the Unicode Sets. 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Groups correspond to character categories - 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // groups of ranges that are in the same original UnicodeSets. 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // fGroupCount is the index of the last used group. 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // State table column 0 is not used. Column 1 is for end-of-input. 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // column 2 is for group 0. Funny counting. 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fGroupCount; 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fSawBOF; 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 131