1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4//  rbbiscan.h
5//
6//  Copyright (C) 2002-2016, International Business Machines Corporation and others.
7//  All Rights Reserved.
8//
9//  This file contains declarations for class RBBIRuleScanner
10//
11
12
13#ifndef RBBISCAN_H
14#define RBBISCAN_H
15
16#include "unicode/utypes.h"
17#include "unicode/uobject.h"
18#include "unicode/rbbi.h"
19#include "unicode/uniset.h"
20#include "unicode/parseerr.h"
21#include "uhash.h"
22#include "uvector.h"
23#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
24                          //    looks up references to $variables within a set.
25#include "rbbinode.h"
26#include "rbbirpt.h"
27
28U_NAMESPACE_BEGIN
29
// Forward declarations.  This header refers to these classes only through
//   pointers, so their full definitions are not needed here.
class   RBBIRuleBuilder;
class   RBBISymbolTable;
32
33
34//--------------------------------------------------------------------------------
35//
36//  class RBBIRuleScanner does the lowest level, character-at-a-time
37//                        scanning of break iterator rules.
38//
39//                        The output of the scanner is parse trees for
40//                        the rule expressions and a list of all Unicode Sets
41//                        encountered.
42//
43//--------------------------------------------------------------------------------
44
45class RBBIRuleScanner : public UMemory {
46public:
47
48    enum {
49        kStackSize = 100            // The size of the state stack for
50    };                              //   rules parsing.  Corresponds roughly
51                                    //   to the depth of parentheses nesting
52                                    //   that is allowed in the rules.
53
54    struct RBBIRuleChar {
55        UChar32             fChar;
56        UBool               fEscaped;
57        RBBIRuleChar() : fChar(0), fEscaped(FALSE) {};
58    };
59
60    RBBIRuleScanner(RBBIRuleBuilder  *rb);
61
62
63    virtual    ~RBBIRuleScanner();
64
65    void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream.
66                                                    // Return false if at end.
67
68    UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.
69                                                    //   Only a single character may be pushed.
70
71    void        parse();                            // Parse the rules, generating two parse
72                                                    //   trees, one each for the forward and
73                                                    //   reverse rules,
74                                                    //   and a list of UnicodeSets encountered.
75
76    /**
77     * Return a rules string without unnecessary
78     * characters.
79     */
80    static UnicodeString stripRules(const UnicodeString &rules);
81private:
82
83    UBool       doParseActions(int32_t a);
84    void        error(UErrorCode e);                   // error reporting convenience function.
85    void        fixOpStack(RBBINode::OpPrecedence p);
86                                                       //   a character.
87    void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
88
89    UChar32     nextCharLL();
90#ifdef RBBI_DEBUG
91    void        printNodeStack(const char *title);
92#endif
93    RBBINode    *pushNewNode(RBBINode::NodeType  t);
94    void        scanSet();
95
96
97    RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.
98
99    int32_t                       fScanIndex;        // Index of current character being processed
100                                                     //   in the rule input string.
101    int32_t                       fNextIndex;        // Index of the next character, which
102                                                     //   is the first character not yet scanned.
103    UBool                         fQuoteMode;        // Scan is in a 'quoted region'
104    int32_t                       fLineNum;          // Line number in input file.
105    int32_t                       fCharNum;          // Char position within the line.
106    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
107                                                     //   as a single line, not two.
108
109    RBBIRuleChar                  fC;                // Current char for parse state machine
110                                                     //   processing.
111    UnicodeString                 fVarName;          // $variableName, valid when we've just
112                                                     //   scanned one.
113
114    RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule
115                                                     //   parsing.  index by p[state][char-class]
116
117    uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
118    int32_t                       fStackPtr;           //  and pops as specified in the state
119                                                       //  transition rules.
120
121    RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created
122                                                           //  during the parse of a rule
123    int32_t                        fNodeStackPtr;
124
125
126    UBool                          fReverseRule;     // True if the rule currently being scanned
127                                                     //  is a reverse direction rule (if it
128                                                     //  starts with a '!')
129
130    UBool                          fLookAheadRule;   // True if the rule includes a '/'
131                                                     //   somewhere within it.
132
133    UBool                          fNoChainInRule;   // True if the current rule starts with a '^'.
134
135    RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of
136                                                     //   $variable symbols.
137
138    UHashtable                    *fSetTable;        // UnicocodeSet hash table, holds indexes to
139                                                     //   the sets created while parsing rules.
140                                                     //   The key is the string used for creating
141                                                     //   the set.
142
143    UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during
144                                                     //  the scanning of RBBI rules.  The
145                                                     //  indicies for these are assigned by the
146                                                     //  perl script that builds the state tables.
147                                                     //  See rbbirpt.h.
148
149    int32_t                        fRuleNum;         // Counts each rule as it is scanned.
150
151    int32_t                        fOptionStart;     // Input index of start of a !!option
152                                                     //   keyword, while being scanned.
153
154    UnicodeSet *gRuleSet_rule_char;
155    UnicodeSet *gRuleSet_white_space;
156    UnicodeSet *gRuleSet_name_char;
157    UnicodeSet *gRuleSet_name_start_char;
158
159    RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
160    RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
161};
162
163U_NAMESPACE_END
164
165#endif
166