1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3// 4// rbbiscan.h 5// 6// Copyright (C) 2002-2016, International Business Machines Corporation and others. 7// All Rights Reserved. 8// 9// This file contains declarations for class RBBIRuleScanner 10// 11 12 13#ifndef RBBISCAN_H 14#define RBBISCAN_H 15 16#include "unicode/utypes.h" 17#include "unicode/uobject.h" 18#include "unicode/rbbi.h" 19#include "unicode/uniset.h" 20#include "unicode/parseerr.h" 21#include "uhash.h" 22#include "uvector.h" 23#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 24 // looks up references to $variables within a set. 25#include "rbbinode.h" 26#include "rbbirpt.h" 27 28U_NAMESPACE_BEGIN 29 30class RBBIRuleBuilder; 31class RBBISymbolTable; 32 33 34//-------------------------------------------------------------------------------- 35// 36// class RBBIRuleScanner does the lowest level, character-at-a-time 37// scanning of break iterator rules. 38// 39// The output of the scanner is parse trees for 40// the rule expressions and a list of all Unicode Sets 41// encountered. 42// 43//-------------------------------------------------------------------------------- 44 45class RBBIRuleScanner : public UMemory { 46public: 47 48 enum { 49 kStackSize = 100 // The size of the state stack for 50 }; // rules parsing. Corresponds roughly 51 // to the depth of parentheses nesting 52 // that is allowed in the rules. 53 54 struct RBBIRuleChar { 55 UChar32 fChar; 56 UBool fEscaped; 57 RBBIRuleChar() : fChar(0), fEscaped(FALSE) {}; 58 }; 59 60 RBBIRuleScanner(RBBIRuleBuilder *rb); 61 62 63 virtual ~RBBIRuleScanner(); 64 65 void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. 66 // Return false if at end. 67 68 UBool push(const RBBIRuleChar &c); // Push (unget) one character. 69 // Only a single character may be pushed. 70 71 void parse(); // Parse the rules, generating two parse 72 // trees, one each for the forward and 73 // reverse rules, 74 // and a list of UnicodeSets encountered. 75 76 /** 77 * Return a rules string without unnecessary 78 * characters. 79 */ 80 static UnicodeString stripRules(const UnicodeString &rules); 81private: 82 83 UBool doParseActions(int32_t a); 84 void error(UErrorCode e); // error reporting convenience function. 85 void fixOpStack(RBBINode::OpPrecedence p); 86 // a character. 87 void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); 88 89 UChar32 nextCharLL(); 90#ifdef RBBI_DEBUG 91 void printNodeStack(const char *title); 92#endif 93 RBBINode *pushNewNode(RBBINode::NodeType t); 94 void scanSet(); 95 96 97 RBBIRuleBuilder *fRB; // The rule builder that we are part of. 98 99 int32_t fScanIndex; // Index of current character being processed 100 // in the rule input string. 101 int32_t fNextIndex; // Index of the next character, which 102 // is the first character not yet scanned. 103 UBool fQuoteMode; // Scan is in a 'quoted region' 104 int32_t fLineNum; // Line number in input file. 105 int32_t fCharNum; // Char position within the line. 106 UChar32 fLastChar; // Previous char, needed to count CR-LF 107 // as a single line, not two. 108 109 RBBIRuleChar fC; // Current char for parse state machine 110 // processing. 111 UnicodeString fVarName; // $variableName, valid when we've just 112 // scanned one. 113 114 RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule 115 // parsing. index by p[state][char-class] 116 117 uint16_t fStack[kStackSize]; // State stack, holds state pushes 118 int32_t fStackPtr; // and pops as specified in the state 119 // transition rules. 120 121 RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created 122 // during the parse of a rule 123 int32_t fNodeStackPtr; 124 125 126 UBool fReverseRule; // True if the rule currently being scanned 127 // is a reverse direction rule (if it 128 // starts with a '!') 129 130 UBool fLookAheadRule; // True if the rule includes a '/' 131 // somewhere within it. 132 133 UBool fNoChainInRule; // True if the current rule starts with a '^'. 134 135 RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of 136 // $variable symbols. 137 138 UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to 139 // the sets created while parsing rules. 140 // The key is the string used for creating 141 // the set. 142 143 UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during 144 // the scanning of RBBI rules. The 145 // indicies for these are assigned by the 146 // perl script that builds the state tables. 147 // See rbbirpt.h. 148 149 int32_t fRuleNum; // Counts each rule as it is scanned. 150 151 int32_t fOptionStart; // Input index of start of a !!option 152 // keyword, while being scanned. 153 154 UnicodeSet *gRuleSet_rule_char; 155 UnicodeSet *gRuleSet_white_space; 156 UnicodeSet *gRuleSet_name_char; 157 UnicodeSet *gRuleSet_name_start_char; 158 159 RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class 160 RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class 161}; 162 163U_NAMESPACE_END 164 165#endif 166