1//
2//  rbbirb.h
3//
4//  Copyright (C) 2002-2008, International Business Machines Corporation and others.
5//  All Rights Reserved.
6//
7//  This file contains declarations for several classes from the
8//    Rule Based Break Iterator rule builder.
9//
10
11
12#ifndef RBBIRB_H
13#define RBBIRB_H
14
15#include "unicode/utypes.h"
16#include "unicode/uobject.h"
17#include "unicode/rbbi.h"
18#include "unicode/uniset.h"
19#include "unicode/parseerr.h"
20#include "uhash.h"
21#include "uvector.h"
22#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
23                          //    looks up references to $variables within a set.
24
25
26
27U_NAMESPACE_BEGIN
28
29class               RBBIRuleScanner;
30struct              RBBIRuleTableEl;
31class               RBBISetBuilder;
32class               RBBINode;
33class               RBBITableBuilder;
34
35
36
37//--------------------------------------------------------------------------------
38//
39//   RBBISymbolTable.    Implements SymbolTable interface that is used by the
40//                       UnicodeSet parser to resolve references to $variables.
41//
42//--------------------------------------------------------------------------------
43class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
44public:                                       //   of these structs for each entry.
45    RBBISymbolTableEntry();
46    UnicodeString          key;
47    RBBINode               *val;
48    ~RBBISymbolTableEntry();
49
50private:
51    RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class
52    RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class
53};
54
55
56class RBBISymbolTable : public UMemory, public SymbolTable {
57private:
58    const UnicodeString      &fRules;
59    UHashtable               *fHashTable;
60    RBBIRuleScanner          *fRuleScanner;
61
62    // These next two fields are part of the mechanism for passing references to
63    //   already-constructed UnicodeSets back to the UnicodeSet constructor
64    //   when the pattern includes $variable references.
65    const UnicodeString      ffffString;      // = "/uffff"
66    UnicodeSet              *fCachedSetLookup;
67
68public:
69    //  API inherited from class SymbolTable
70    virtual const UnicodeString*  lookup(const UnicodeString& s) const;
71    virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
72    virtual UnicodeString parseReference(const UnicodeString& text,
73                                         ParsePosition& pos, int32_t limit) const;
74
75    //  Additional Functions
76    RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
77    virtual ~RBBISymbolTable();
78
79    virtual RBBINode *lookupNode(const UnicodeString &key) const;
80    virtual void      addEntry  (const UnicodeString &key, RBBINode *val, UErrorCode &err);
81
82#ifdef RBBI_DEBUG
83    virtual void      rbbiSymtablePrint() const;
84#else
85    // A do-nothing inline function for non-debug builds.  Member funcs can't be empty
86    //  or the call sites won't compile.
87    int32_t fFakeField;
88    #define rbbiSymtablePrint() fFakeField=0;
89#endif
90
91private:
92    RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
93    RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
94};
95
96
97//--------------------------------------------------------------------------------
98//
99//  class RBBIRuleBuilder       The top-level class handling RBBI rule compiling.
100//
101//--------------------------------------------------------------------------------
102class RBBIRuleBuilder : public UMemory {
103public:
104
105    //  Create a rule based break iterator from a set of rules.
106    //  This function is the main entry point into the rule builder.  The
107    //   public ICU API for creating RBBIs uses this function to do the actual work.
108    //
109    static BreakIterator * createRuleBasedBreakIterator( const UnicodeString    &rules,
110                                    UParseError      *parseError,
111                                    UErrorCode       &status);
112
113public:
114    // The "public" functions and data members that appear below are accessed
115    //  (and shared) by the various parts that make up the rule builder.  They
116    //  are NOT intended to be accessed by anything outside of the
117    //  rule builder implementation.
118    RBBIRuleBuilder(const UnicodeString  &rules,
119                    UParseError          *parseErr,
120                    UErrorCode           &status
121        );
122
123    virtual    ~RBBIRuleBuilder();
124    char                          *fDebugEnv;        // controls debug trace output
125    UErrorCode                    *fStatus;          // Error reporting.  Keeping status
126    UParseError                   *fParseError;      //   here avoids passing it everywhere.
127    const UnicodeString           &fRules;           // The rule string that we are compiling
128
129    RBBIRuleScanner               *fScanner;         // The scanner.
130    RBBINode                      *fForwardTree;     // The parse trees, generated by the scanner,
131    RBBINode                      *fReverseTree;     //   then manipulated by subsequent steps.
132    RBBINode                      *fSafeFwdTree;
133    RBBINode                      *fSafeRevTree;
134
135    RBBINode                      **fDefaultTree;    // For rules not qualified with a !
136                                                     //   the tree to which they belong to.
137
138    UBool                         fChainRules;       // True for chained Unicode TR style rules.
139                                                     // False for traditional regexp rules.
140
141    UBool                         fLBCMNoChain;      // True:  suppress chaining of rules on
142                                                     //   chars with LineBreak property == CM.
143
144    UBool                         fLookAheadHardBreak;  // True:  Look ahead matches cause an
145                                                     // immediate break, no continuing for the
146                                                     // longest match.
147
148    RBBISetBuilder                *fSetBuilder;      // Set and Character Category builder.
149    UVector                       *fUSetNodes;       // Vector of all uset nodes.
150
151    RBBITableBuilder              *fForwardTables;   // State transition tables
152    RBBITableBuilder              *fReverseTables;
153    RBBITableBuilder              *fSafeFwdTables;
154    RBBITableBuilder              *fSafeRevTables;
155
156    UVector                       *fRuleStatusVals;  // The values that can be returned
157                                                     //   from getRuleStatus().
158
159    RBBIDataHeader                *flattenData();    // Create the flattened (runtime format)
160                                                     // data tables..
161private:
162    RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class
163    RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class
164};
165
166
167
168
169//----------------------------------------------------------------------------
170//
171//   RBBISetTableEl   is an entry in the hash table of UnicodeSets that have
172//                    been encountered.  The val Node will be of nodetype uset
173//                    and contain pointers to the actual UnicodeSets.
174//                    The Key is the source string for initializing the set.
175//
176//                    The hash table is used to avoid creating duplicate
177//                    unnamed (not $var references) UnicodeSets.
178//
179//                    Memory Management:
180//                       The Hash Table owns these RBBISetTableEl structs and
181//                            the key strings.  It does NOT own the val nodes.
182//
183//----------------------------------------------------------------------------
184struct RBBISetTableEl {
185    UnicodeString *key;
186    RBBINode      *val;
187};
188
189
190//----------------------------------------------------------------------------
191//
192//   RBBIDebugPrintf    Printf equivalent, for debugging output.
193//                      Conditional compilation of the implementation lets us
194//                      get rid of the stdio dependency in environments where it
195//                      is unavailable.
196//
197//----------------------------------------------------------------------------
198#ifdef RBBI_DEBUG
199#include <stdio.h>
200#define RBBIDebugPrintf printf
201#define RBBIDebugPuts puts
202#else
203#undef RBBIDebugPrintf
204#define RBBIDebugPuts(arg)
205#endif
206
207U_NAMESPACE_END
208#endif
209
210
211
212