1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// rbbiscan.h 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho// Copyright (C) 2002-2008, International Business Machines Corporation and others. 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// All Rights Reserved. 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// This file contains declarations for class RBBIRuleScanner 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef RBBISCAN_H 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define RBBISCAN_H 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uobject.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/rbbi.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/parseerr.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uhash.h" 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h" 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // looks up references to $variables within a set. 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbinode.h" 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//#include "rbbitblb.h" 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass RBBIRuleBuilder; 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass RBBISymbolTable; 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//-------------------------------------------------------------------------------- 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// class RBBIRuleScanner does the lowest level, character-at-a-time 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// scanning of break iterator rules. 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// The output of the scanner is parse trees for 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// the rule expressions and a list of all Unicode Sets 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// encountered. 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//-------------------------------------------------------------------------------- 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass RBBIRuleScanner : public UMemory { 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 4885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho enum { 4985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho kStackSize = 100 // The size of the state stack for 5085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho }; // rules parsing. Corresponds roughly 5185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // to the depth of parentheses nesting 5285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho // that is allowed in the rules. 5385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru struct RBBIRuleChar { 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 fChar; 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fEscaped; 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru }; 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIRuleScanner(RBBIRuleBuilder *rb); 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru virtual ~RBBIRuleScanner(); 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void nextChar(RBBIRuleChar &c); // Get the next char from the input stream. 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Return false if at end. 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool push(const RBBIRuleChar &c); // Push (unget) one character. 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Only a single character may be pushed. 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void parse(); // Parse the rules, generating two parse 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // trees, one each for the forward and 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // reverse rules, 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and a list of UnicodeSets encountered. 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /** 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Return a rules string without unnecessary 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * characters. 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru static UnicodeString stripRules(const UnicodeString &rules); 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool doParseActions(int32_t a); 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void error(UErrorCode e); // error reporting convenience function. 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void fixOpStack(RBBINode::OpPrecedence p); 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // a character. 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL); 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 nextCharLL(); 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifdef RBBI_DEBUG 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void printNodeStack(const char *title); 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBINode *pushNewNode(RBBINode::NodeType t); 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void scanSet(); 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIRuleBuilder *fRB; // The rule builder that we are part of. 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fScanIndex; // Index of current character being processed 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // in the rule input string. 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fNextIndex; // Index of the next character, which 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // is the first character not yet scanned. 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fQuoteMode; // Scan is in a 'quoted region' 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fLineNum; // Line number in input file. 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fCharNum; // Char position within the line. 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 fLastChar; // Previous char, needed to count CR-LF 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // as a single line, not two. 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIRuleChar fC; // Current char for parse state machine 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // processing. 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeString fVarName; // $variableName, valid when we've just 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // scanned one. 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // parsing. index by p[state][char-class] 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint16_t fStack[kStackSize]; // State stack, holds state pushes 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fStackPtr; // and pops as specified in the state 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // transition rules. 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // during the parse of a rule 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fNodeStackPtr; 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fReverseRule; // True if the rule currently being scanned 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // is a reverse direction rule (if it 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // starts with a '!') 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fLookAheadRule; // True if the rule includes a '/' 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // somewhere within it. 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // $variable symbols. 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the sets created while parsing rules. 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The key is the string used for creating 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the set. 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 14085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the scanning of RBBI rules. The 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // indicies for these are assigned by the 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // perl script that builds the state tables. 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // See rbbirpt.h. 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fRuleNum; // Counts each rule as it is scanned. 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fOptionStart; // Input index of start of a !!option 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // keyword, while being scanned. 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet *gRuleSet_rule_char; 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet *gRuleSet_white_space; 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet *gRuleSet_name_char; 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeSet *gRuleSet_name_start_char; 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 163