//
//  rbbiscan.h
//
//  Copyright (C) 2002-2008, International Business Machines Corporation and others.
//  All Rights Reserved.
//
//  This file contains declarations for class RBBIRuleScanner
//

11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef RBBISCAN_H
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define RBBISCAN_H
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uobject.h"
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/rbbi.h"
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h"
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/parseerr.h"
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uhash.h"
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h"
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                          //    looks up references to $variables within a set.
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbinode.h"
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//#include "rbbitblb.h"
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass   RBBIRuleBuilder;
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass   RBBISymbolTable;
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
//--------------------------------------------------------------------------------
//
//  class RBBIRuleScanner does the lowest level, character-at-a-time
//                        scanning of break iterator rules.
//
//                        The output of the scanner is parse trees for
//                        the rule expressions and a list of all Unicode Sets
//                        encountered.
//
//--------------------------------------------------------------------------------

45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass RBBIRuleScanner : public UMemory {
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic:
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
4885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    enum {
4985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        kStackSize = 100            // The size of the state stack for
5085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    };                              //   rules parsing.  Corresponds roughly
5185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                                    //   to the depth of parentheses nesting
5285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                                    //   that is allowed in the rules.
5385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    struct RBBIRuleChar {
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UChar32             fChar;
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        UBool               fEscaped;
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    };
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBIRuleScanner(RBBIRuleBuilder  *rb);
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    virtual    ~RBBIRuleScanner();
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream.
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                    // Return false if at end.
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                    //   Only a single character may be pushed.
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void        parse();                            // Parse the rules, generating two parse
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                    //   trees, one each for the forward and
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                    //   reverse rules,
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                    //   and a list of UnicodeSets encountered.
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    /**
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * Return a rules string without unnecessary
77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     * characters.
78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru     */
79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    static UnicodeString stripRules(const UnicodeString &rules);
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate:
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool       doParseActions(int32_t a);
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void        error(UErrorCode e);                   // error reporting convenience function.
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void        fixOpStack(RBBINode::OpPrecedence p);
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                       //   a character.
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32     nextCharLL();
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifdef RBBI_DEBUG
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void        printNodeStack(const char *title);
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBINode    *pushNewNode(RBBINode::NodeType  t);
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    void        scanSet();
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t                       fScanIndex;        // Index of current character being processed
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   in the rule input string.
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t                       fNextIndex;        // Index of the next character, which
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   is the first character not yet scanned.
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool                         fQuoteMode;        // Scan is in a 'quoted region'
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t                       fLineNum;          // Line number in input file.
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t                       fCharNum;          // Char position within the line.
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UChar32                       fLastChar;         // Previous char, needed to count CR-LF
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   as a single line, not two.
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBIRuleChar                  fC;                // Current char for parse state machine
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   processing.
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeString                 fVarName;          // $variableName, valid when we've just
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   scanned one.
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   parsing.  index by p[state][char-class]
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t                       fStackPtr;           //  and pops as specified in the state
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                       //  transition rules.
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                           //  during the parse of a rule
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t                        fNodeStackPtr;
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool                          fReverseRule;     // True if the rule currently being scanned
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //  is a reverse direction rule (if it
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //  starts with a '!')
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UBool                          fLookAheadRule;   // True if the rule includes a '/'
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   somewhere within it.
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   $variable symbols.
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UHashtable                    *fSetTable;        // UnicocodeSet hash table, holds indexes to
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   the sets created while parsing rules.
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   The key is the string used for creating
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   the set.
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
14085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during
141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //  the scanning of RBBI rules.  The
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //  indicies for these are assigned by the
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //  perl script that builds the state tables.
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //  See rbbirpt.h.
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t                        fRuleNum;         // Counts each rule as it is scanned.
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t                        fOptionStart;     // Input index of start of a !!option
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                                     //   keyword, while being scanned.
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeSet *gRuleSet_rule_char;
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeSet *gRuleSet_white_space;
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeSet *gRuleSet_name_char;
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeSet *gRuleSet_name_start_char;
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru};
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
163