1// 2// rbbirb.h 3// 4// Copyright (C) 2002-2008, International Business Machines Corporation and others. 5// All Rights Reserved. 6// 7// This file contains declarations for several classes from the 8// Rule Based Break Iterator rule builder. 9// 10 11 12#ifndef RBBIRB_H 13#define RBBIRB_H 14 15#include "unicode/utypes.h" 16#include "unicode/uobject.h" 17#include "unicode/rbbi.h" 18#include "unicode/uniset.h" 19#include "unicode/parseerr.h" 20#include "uhash.h" 21#include "uvector.h" 22#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 23 // looks up references to $variables within a set. 24 25 26 27U_NAMESPACE_BEGIN 28 29class RBBIRuleScanner; 30struct RBBIRuleTableEl; 31class RBBISetBuilder; 32class RBBINode; 33class RBBITableBuilder; 34 35 36 37//-------------------------------------------------------------------------------- 38// 39// RBBISymbolTable. Implements SymbolTable interface that is used by the 40// UnicodeSet parser to resolve references to $variables. 41// 42//-------------------------------------------------------------------------------- 43class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one 44public: // of these structs for each entry. 45 RBBISymbolTableEntry(); 46 UnicodeString key; 47 RBBINode *val; 48 ~RBBISymbolTableEntry(); 49 50private: 51 RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class 52 RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class 53}; 54 55 56class RBBISymbolTable : public UMemory, public SymbolTable { 57private: 58 const UnicodeString &fRules; 59 UHashtable *fHashTable; 60 RBBIRuleScanner *fRuleScanner; 61 62 // These next two fields are part of the mechanism for passing references to 63 // already-constructed UnicodeSets back to the UnicodeSet constructor 64 // when the pattern includes $variable references. 65 const UnicodeString ffffString; // = "/uffff" 66 UnicodeSet *fCachedSetLookup; 67 68public: 69 // API inherited from class SymbolTable 70 virtual const UnicodeString* lookup(const UnicodeString& s) const; 71 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; 72 virtual UnicodeString parseReference(const UnicodeString& text, 73 ParsePosition& pos, int32_t limit) const; 74 75 // Additional Functions 76 RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); 77 virtual ~RBBISymbolTable(); 78 79 virtual RBBINode *lookupNode(const UnicodeString &key) const; 80 virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); 81 82#ifdef RBBI_DEBUG 83 virtual void rbbiSymtablePrint() const; 84#else 85 // A do-nothing inline function for non-debug builds. Member funcs can't be empty 86 // or the call sites won't compile. 87 int32_t fFakeField; 88 #define rbbiSymtablePrint() fFakeField=0; 89#endif 90 91private: 92 RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class 93 RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class 94}; 95 96 97//-------------------------------------------------------------------------------- 98// 99// class RBBIRuleBuilder The top-level class handling RBBI rule compiling. 100// 101//-------------------------------------------------------------------------------- 102class RBBIRuleBuilder : public UMemory { 103public: 104 105 // Create a rule based break iterator from a set of rules. 106 // This function is the main entry point into the rule builder. The 107 // public ICU API for creating RBBIs uses this function to do the actual work. 108 // 109 static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, 110 UParseError *parseError, 111 UErrorCode &status); 112 113public: 114 // The "public" functions and data members that appear below are accessed 115 // (and shared) by the various parts that make up the rule builder. They 116 // are NOT intended to be accessed by anything outside of the 117 // rule builder implementation. 118 RBBIRuleBuilder(const UnicodeString &rules, 119 UParseError *parseErr, 120 UErrorCode &status 121 ); 122 123 virtual ~RBBIRuleBuilder(); 124 char *fDebugEnv; // controls debug trace output 125 UErrorCode *fStatus; // Error reporting. Keeping status 126 UParseError *fParseError; // here avoids passing it everywhere. 127 const UnicodeString &fRules; // The rule string that we are compiling 128 129 RBBIRuleScanner *fScanner; // The scanner. 130 RBBINode *fForwardTree; // The parse trees, generated by the scanner, 131 RBBINode *fReverseTree; // then manipulated by subsequent steps. 132 RBBINode *fSafeFwdTree; 133 RBBINode *fSafeRevTree; 134 135 RBBINode **fDefaultTree; // For rules not qualified with a ! 136 // the tree to which they belong to. 137 138 UBool fChainRules; // True for chained Unicode TR style rules. 139 // False for traditional regexp rules. 140 141 UBool fLBCMNoChain; // True: suppress chaining of rules on 142 // chars with LineBreak property == CM. 143 144 UBool fLookAheadHardBreak; // True: Look ahead matches cause an 145 // immediate break, no continuing for the 146 // longest match. 147 148 RBBISetBuilder *fSetBuilder; // Set and Character Category builder. 149 UVector *fUSetNodes; // Vector of all uset nodes. 150 151 RBBITableBuilder *fForwardTables; // State transition tables 152 RBBITableBuilder *fReverseTables; 153 RBBITableBuilder *fSafeFwdTables; 154 RBBITableBuilder *fSafeRevTables; 155 156 UVector *fRuleStatusVals; // The values that can be returned 157 // from getRuleStatus(). 158 159 RBBIDataHeader *flattenData(); // Create the flattened (runtime format) 160 // data tables.. 161private: 162 RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class 163 RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class 164}; 165 166 167 168 169//---------------------------------------------------------------------------- 170// 171// RBBISetTableEl is an entry in the hash table of UnicodeSets that have 172// been encountered. The val Node will be of nodetype uset 173// and contain pointers to the actual UnicodeSets. 174// The Key is the source string for initializing the set. 175// 176// The hash table is used to avoid creating duplicate 177// unnamed (not $var references) UnicodeSets. 178// 179// Memory Management: 180// The Hash Table owns these RBBISetTableEl structs and 181// the key strings. It does NOT own the val nodes. 182// 183//---------------------------------------------------------------------------- 184struct RBBISetTableEl { 185 UnicodeString *key; 186 RBBINode *val; 187}; 188 189 190//---------------------------------------------------------------------------- 191// 192// RBBIDebugPrintf Printf equivalent, for debugging output. 193// Conditional compilation of the implementation lets us 194// get rid of the stdio dependency in environments where it 195// is unavailable. 196// 197//---------------------------------------------------------------------------- 198#ifdef RBBI_DEBUG 199#include <stdio.h> 200#define RBBIDebugPrintf printf 201#define RBBIDebugPuts puts 202#else 203#undef RBBIDebugPrintf 204#define RBBIDebugPuts(arg) 205#endif 206 207U_NAMESPACE_END 208#endif 209 210 211 212