// regexcmp.h revision b13da9df870a61b11249bf741347908dbea0edd8
1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// regexcmp.h 3b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Copyright (C) 2002-2007, International Business Machines Corporation and others. 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// All Rights Reserved. 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// This file contains declarations for the class RegexCompile 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// This class is internal to the regular expression implementation. 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// For the public Regular Expression API, see the file "unicode/regex.h" 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#ifndef RBBISCAN_H 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#define RBBISCAN_H 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uobject.h" 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uniset.h" 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/parseerr.h" 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "uhash.h" 
24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "uvector.h" 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 26b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_BEGIN 29b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------------------- 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// class RegexCompile Contains the regular expression compiler. 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//-------------------------------------------------------------------------------- 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic const int kStackSize = 100; // The size of the state stack for 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pattern parsing. Corresponds roughly 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to the depth of parentheses nesting 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // that is allowed in the rules. 
40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustruct RegexTableEl; 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruclass RegexPattern; 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruclass RegexCompile : public UMemory { 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querupublic: 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru struct RegexPatternChar { 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 fChar; 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool fQuoted; 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RegexCompile(RegexPattern *rp, UErrorCode &e); 54b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 55b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru virtual ~RegexCompile(); 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void nextChar(RegexPatternChar &c); // Get the next char from the input stream. 
61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru static void cleanup(); // Memory cleanup 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Categories of parentheses in pattern. 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The category is saved in the compile-time parentheses stack frame, and 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // determines the code to be generated when the matching close ) is encountered. 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru enum EParenClass { 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru plain = -1, // No special handling 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru capturing = -2, 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atomic = -3, 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lookAhead = -4, 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru negLookAhead = -5, 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru flags = -6, 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lookBehind = -7, 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lookBehindN = -8 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru }; 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruprivate: 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool doParseActions(int32_t a); 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void error(UErrorCode e); // error reporting convenience function. 
85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 nextCharLL(); 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 peekCharLL(); 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *scanSet(); 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *scanProp(); 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void handleCloseParen(); 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // at the top of the just completed block 93b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // or operation, and optionally ensure that 94b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // there is space to add an opcode there. 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // a reference to a UnicodeSet. 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier. 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t LoopOp); 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void literalChar(UChar32 c); // Compile a literal char 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void fixLiterals(UBool split=FALSE); // Fix literal strings. 
102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void insertOp(int32_t where); // Open up a slot for a new op in the 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // generated code at the specified location. 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void emitONE_CHAR(UChar32 c); // EMit a ONE_CHAR op into the compiled code, 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // taking case mode into account. 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t minMatchLength(int32_t start, 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end); 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t maxMatchLength(int32_t start, 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end); 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void matchStartType(); 111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void stripNOPs(); 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru void OptDotStar(); 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode *fStatus; 116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RegexPattern *fRXPat; 117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UParseError *fParseErr; 118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Data associated with low level character scanning 121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fScanIndex; // Index of current character being processed 123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // in the rule input string. 
124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fNextIndex; // Index of the next character, which 125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is the first character not yet scanned. 126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool fQuoteMode; // Scan is in a \Q...\E quoted region 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool fInBackslashQuote; // Scan is between a '\' and the following char. 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool fEOLComments; // When scan is just after '(?', inhibit #... to 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // end of line comments, in favor of (?#...) comments. 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fLineNum; // Line number in input file. 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fCharNum; // Char position within the line. 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 fLastChar; // Previous char, needed to count CR-LF 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // as a single line, not two. 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 fPeekChar; // Saved char, if we've scanned ahead. 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RegexPatternChar fC; // Current char for parse state machine 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // processing. 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Data for the state machine that parses the regular expression. 
142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru RegexTableEl **fStateTable; // State Transition Table for regex Rule 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parsing. index by p[state][char-class] 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint16_t fStack[kStackSize]; // State stack, holds state pushes 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fStackPtr; // and pops as specified in the state 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // transition rules. 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Data associated with the generation of the pcode for the match engine 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.) 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Always has high bit (31) set so that flag values 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // on the paren stack are distinguished from relocatable 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pcode addresses. 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fNewModeFlags; // New flags, while compiling (?i, holds state 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // until last flag is scanned. 
159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool fSetModeFlag; // true for (?ismx, false for (?-ismx 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fStringOpStart; // While a literal string is being scanned 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // holds the start index within RegexPattern. 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // fLiteralText where the string is being stored. 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fPatternLength; // Length of the input pattern string. 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 fParenStack; // parentheses stack. Each frame consists of 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the positions of compiled pattern operations 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // needing fixup, followed by negative value. The 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // first entry in each frame is the position of the 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // spot reserved for use when a quantifier 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // needs to add a SAVE at the start of a (block) 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The negative value (-1, -2,...) indicates 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the kind of paren that opened the frame. Some 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // need special handling on close. 
177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fMatchOpenParen; // The position in the compiled pattern 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the slot reserved for a state save 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // at the start of the most recently processed 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parenthesized block. 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fMatchCloseParen; // The position in the pattern of the first 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // location after the most recently processed 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parenthesized block. 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fIntervalLow; // {lower, upper} interval quantifier values. 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fIntervalUpper; // Placed here temporarily, when pattern is 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // initially scanned. Each new interval 190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // encountered overwrites these values. 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // -1 for the upper interval value means none 192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // was specified (unlimited occurences.) 
193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fNameStartPos; // Starting position of a \N{NAME} name in a 195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pattern, valid while remainder of name is 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // scanned. 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru}; 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_END 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif // RBBISCAN_H 202