1// 2// rbbisetb.h 3/* 4********************************************************************** 5* Copyright (c) 2001-2005, International Business Machines 6* Corporation and others. All Rights Reserved. 7********************************************************************** 8*/ 9 10#ifndef RBBISETB_H 11#define RBBISETB_H 12 13#include "unicode/utypes.h" 14#include "unicode/uobject.h" 15#include "rbbirb.h" 16#include "uvector.h" 17 18struct UNewTrie; 19 20U_NAMESPACE_BEGIN 21 22// 23// RBBISetBuilder Derives the character categories used by the runtime RBBI engine 24// from the Unicode Sets appearing in the source RBBI rules, and 25// creates the TRIE table used to map from Unicode to the 26// character categories. 27// 28 29 30// 31// RangeDescriptor 32// 33// Each of the non-overlapping character ranges gets one of these descriptors. 34// All of them are strung together in a linked list, which is kept in order 35// (by character) 36// 37class RangeDescriptor : public UMemory { 38public: 39 UChar32 fStartChar; // Start of range, unicode 32 bit value. 40 UChar32 fEndChar; // End of range, unicode 32 bit value. 41 int32_t fNum; // runtime-mapped input value for this range. 42 UVector *fIncludesSets; // vector of the the original 43 // Unicode sets that include this range. 44 // (Contains ptrs to uset nodes) 45 RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. 46 47 RangeDescriptor(UErrorCode &status); 48 RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); 49 ~RangeDescriptor(); 50 void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with 51 // where appearing in the second (higher) part. 52 void setDictionaryFlag(); // Check whether this range appears as part of 53 // the Unicode set named "dictionary" 54 55private: 56 RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class 57 RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class 58}; 59 60 61// 62// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. 63// 64// Starting with the rules parse tree from the scanner, 65// 66// - Enumerate the set of UnicodeSets that are referenced 67// by the RBBI rules. 68// - compute a derived set of non-overlapping UnicodeSets 69// that will correspond to columns in the state table for 70// the RBBI execution engine. 71// - construct the trie table that maps input characters 72// to set numbers in the non-overlapping set of sets. 73// 74 75 76class RBBISetBuilder : public UMemory { 77public: 78 RBBISetBuilder(RBBIRuleBuilder *rb); 79 ~RBBISetBuilder(); 80 81 void build(); 82 void addValToSets(UVector *sets, uint32_t val); 83 void addValToSet (RBBINode *usetNode, uint32_t val); 84 int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the 85 // runtime state machine, which are the same as 86 // columns in the DFA state table 87 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. 88 void serializeTrie(uint8_t *where); // write out the serialized Trie. 89 UChar32 getFirstChar(int32_t val) const; 90 UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo 91 // character were encountered. 92#ifdef RBBI_DEBUG 93 void printSets(); 94 void printRanges(); 95 void printRangeGroups(); 96#else 97 #define printSets() 98 #define printRanges() 99 #define printRangeGroups() 100#endif 101 102private: 103 void numberSets(); 104 105 RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. 106 UErrorCode *fStatus; 107 108 RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors 109 110 UNewTrie *fTrie; // The mapping TRIE that is the end result of processing 111 uint32_t fTrieSize; // the Unicode Sets. 112 113 // Groups correspond to character categories - 114 // groups of ranges that are in the same original UnicodeSets. 115 // fGroupCount is the index of the last used group. 116 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. 117 // State table column 0 is not used. Column 1 is for end-of-input. 118 // column 2 is for group 0. Funny counting. 119 int32_t fGroupCount; 120 121 UBool fSawBOF; 122 123 RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class 124 RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class 125}; 126 127 128 129U_NAMESPACE_END 130#endif 131