1b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// file: regexcmp.cpp 3b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Copyright (C) 2002-2011 International Business Machines Corporation and others. 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// All Rights Reserved. 6b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// This file contains the ICU regular expression compiler, which is responsible 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// for processing a regular expression pattern into the compiled form that 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// is used by the match finding engine. 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/ustring.h" 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/unistr.h" 18b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uniset.h" 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uchar.h" 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uchriter.h" 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/parsepos.h" 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/parseerr.h" 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/regex.h" 24b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "patternprops.h" 2550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "putilimp.h" 26b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "cmemory.h" 27b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "cstring.h" 28b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "uvectr32.h" 2950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uvectr64.h" 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "uassert.h" 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "ucln_in.h" 32c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "uinvchar.h" 33b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "regeximp.h" 35b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "regexcst.h" // Contains state table for the regex pattern parser. 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // generated by a Perl script. 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "regexcmp.h" 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "regexst.h" 3950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "regextxt.h" 40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 42b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_BEGIN 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Constructor. 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 51c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruRegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : 52c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fParenStack(status), fSetStack(status), fSetOpStack(status) 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 5450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Lazy init of all shared global sets (needed for init()'s empty text) 5550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho RegexStaticSets::initGlobals(&status); 5650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 57b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStatus = &status; 58b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 59b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat = rxp; 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fScanIndex = 0; 61807b6b36605a2970f69dc767fee84a1b2a31e5e3Elliott Hughes fLastChar = -1; 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPeekChar = -1; 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fLineNum = 1; 64b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fCharNum = 0; 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fQuoteMode = FALSE; 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fInBackslashQuote = FALSE; 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fModeFlags = fRXPat->fFlags | 0x80000000; 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fEOLComments = TRUE; 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchOpenParen = -1; 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchCloseParen = -1; 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStringOpStart = -1; 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { 75b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = rxp->fDeferredStatus; 76b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 79c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chAmp = 0x26; // '&' 80c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chDash = 0x2d; // '-' 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 83b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 84b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Destructor 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRegexCompile::~RegexCompile() { 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 91c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { 92c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec)); 93c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 94c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 97b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Compile regex pattern. The state machine for rexexp pattern parsing is here. 98b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The state tables are hand-written in the file regexcst.txt, 99b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// and converted to the form used here by a perl 100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// script regexcst.pl 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::compile( 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString &pat, // Source pat to be compiled. 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UParseError &pp, // Error position info 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode &e) // Error Code 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 10850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fPatternString = new UnicodeString(pat); 10950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UText patternText = UTEXT_INITIALIZER; 11050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); 11150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 11250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (U_SUCCESS(e)) { 11350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compile(&patternText, pp, e); 11450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utext_close(&patternText); 11550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 11650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 11750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 11850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// 11950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// compile, UText mode 12050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// All the work is actually done here. 12150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// 12250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid RegexCompile::compile( 12350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UText *pat, // Source pat to be compiled. 12450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UParseError &pp, // Error position info 12550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &e) // Error Code 12650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho{ 127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStatus = &e; 128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParseErr = &pp; 129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStackPtr = 0; 130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStack[fStackPtr] = 0; 131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // There should be no pattern stuff in the RegexPattern object. They can not be reused. 13750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0); 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Prepare the RegexPattern object to receive the compiled pattern. 14050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus); 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; 143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Initialize the pattern scanning state machine 14650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fPatternLength = utext_nativeLength(pat); 147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint16_t state = 1; 148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const RegexTableEl *tableEl; 149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextChar(fC); // Fetch the first char from the pattern string. 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Main loop for the regex pattern parsing state machine. 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Runs once per state transition. 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Each time through optionally performs, depending on the state table, 155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - an advance to the the next pattern char 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - an action to be performed. 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - pushing or popping a state to/from the local state return stack. 158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // file regexcst.txt is the source for the state table. The logic behind 159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // recongizing the pattern syntax is there, not here. 160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Bail out if anything has gone wrong. 163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Regex pattern parsing stops on the first error encountered. 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(state != 0); 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Find the state table element that matches the input char from the pattern, or the 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // class of the input character. Start with the first table row for this 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // state, then linearly scan forward until we find a row that matches the 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // character. The last row for each state always matches all characters, so 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the search will stop there, if not before. 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tableEl = &gRuleParseStateTable[state]; 177c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { // loop through table rows belonging to this state, looking for one 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // that matches the current input char. 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru REGEX_SCAN_DEBUG_PRINTF((".")); 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) { 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Table row specified an individual character, not a set, and 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the input character is not quoted, and 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the input character matched it. 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fCharClass == 255) { 190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Table row specified default, match anything character class. 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fCharClass == 254 && fC.fQuoted) { 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Table row specified "quoted" and the char was quoted. 195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) { 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Table row specified eof and we hit eof on the input. 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fC.fQuoted == FALSE && // char is not escaped && 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fC.fChar != (UChar32)-1) { // char is not EOF 205c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Table row specified a character class, or set of characters, 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and the current char matches it. 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // No match on this row, advance to the next row for this state, 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tableEl++; 214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru REGEX_SCAN_DEBUG_PRINTF(("\n")); 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've found the row of the state table that matches the current input 219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // character from the rules string. 220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Perform any action specified by this row in the state table. 221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (doParseActions(tableEl->fAction) == FALSE) { 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Break out of the state machine loop if the 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the action signalled some kind of error, or 224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the action was to exit, occurs on normal end-of-rules-input. 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fPushState != 0) { 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStackPtr++; 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fStackPtr >= kStackSize) { 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_INTERNAL_ERROR); 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n")); 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStackPtr--; 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStack[fStackPtr] = tableEl->fPushState; 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // NextChar. This is where characters are actually fetched from the pattern. 240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Happens under control of the 'n' tag in the state table. 241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fNextChar) { 243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextChar(fC); 244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Get the next state from the table entry, or from the 247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // state stack if the next state was specified as "pop". 248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fNextState != 255) { 249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru state = tableEl->fNextState; 250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru state = fStack[fStackPtr]; 252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStackPtr--; 253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fStackPtr < 0) { 254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // state stack underflow 255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This will occur if the user pattern has mis-matched parentheses, 256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // with extra close parens. 257c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStackPtr++; 259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MISMATCHED_PAREN); 260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 265c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 266c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Bail out if the pattern had errors. 267c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Set stack cleanup: a successful compile would have left it empty, 268c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // but errors can leave temporary sets hanging around. 269c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru while (!fSetStack.empty()) { 270c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete (UnicodeSet *)fSetStack.pop(); 271c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 272c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return; 273c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 274c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The pattern has now been read and processed, and the compiled code generated. 277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compute the number of digits requried for the largest capture group number. 281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fMaxCaptureDigits = 1; 283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t n = 10; 28450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t groupCount = fRXPat->fGroupMap->size(); 28550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while (n <= groupCount) { 286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fMaxCaptureDigits++; 287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru n *= 10; 288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The pattern's fFrameSize so far has accumulated the requirements for 292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // storage for capture parentheses, counters, etc. that are encountered 293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // in the pattern. Add space for the two variables that are always 29450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // present in the saved state: the input string position (int64_t) and 29550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // the position in the compiled pattern. 29650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 29750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; 29850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 30050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Optimization pass 1: NOPs, back-references, and case-folding 30150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 30250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho stripNOPs(); 303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Get bounds for the minimum and maximum length of a string that this 306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pattern can match. Used to avoid looking for matches in strings that 307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are too short. 308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); 310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 31250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Optimization pass 2: match start type 313c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 314c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru matchStartType(); 315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Set up fast latin-1 range sets 318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t numSets = fRXPat->fSets->size(); 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fSets8 = new Regex8BitSet[numSets]; 321c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Null pointer check. 322c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fRXPat->fSets8 == NULL) { 323c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru e = *fStatus = U_MEMORY_ALLOCATION_ERROR; 324c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return; 325c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i; 327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=0; i<numSets; i++) { 328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i); 329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fSets8[i].init(s); 330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// doParseAction Do some action during regex pattern parsing. 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Called by the parse state machine. 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Generation of the match engine PCode happens here, or 344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// in functions called from the parse actions defined here. 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool RegexCompile::doParseActions(int32_t action) 349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool returnVal = TRUE; 351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch ((Regex_PatternParseAction)action) { 353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPatStart: 355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Start of pattern compiles to: 356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //0 SAVE 2 Fall back to position of FAIL 357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //1 jmp 3 358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //2 FAIL Stop if we ever reach here. 359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //3 NOP Dummy, so start of pattern looks the same as 360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the start of an ( grouping. 361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //4 NOP Resreved, will be replaced by a save if there are 362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // OR | operators at the top level 363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus); 364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); 365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); 366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Standard open nonCapture paren action emits the two NOPs and 368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets up the paren stack frame. 369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru doParseActions(doOpenNonCaptureParen); 370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPatFinish: 373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've scanned to the end of the pattern 374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The end of pattern compiles to: 375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // URX_END 376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // which will stop the runtime match engine. 377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Encountering end of pattern also behaves like a close paren, 378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and forces fixups of the State Save at the beginning of the compiled pattern 379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and of any OR operations at the top level. 380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru handleCloseParen(); 382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fParenStack.size() > 0) { 383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Missing close paren in pattern. 384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MISMATCHED_PAREN); 385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // add the END operation to the compiled pattern. 388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); 389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Terminate the pattern compilation state machine. 391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru returnVal = FALSE; 392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOrOperator: 397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanning a '|', as in (A|B) 398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert a SAVE operation at the start of the pattern section preceding 400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this OR at this level. This SAVE will branch the match forward 401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to the right hand side of the OR in the event that the left hand 402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // side fails to match and backtracks. Locate the position for the 403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // save from the location on the top of the parentheses stack. 404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t savePosition = fParenStack.popi(); 40550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition); 406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location 407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); 408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, savePosition); 409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Append an JMP operation into the compiled pattern. The operand for 411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the JMP will eventually be the location following the ')' for the 412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // group. This will be patched in later, when the ')' is encountered. 413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_JMP, 0); 414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Push the position of the newly added JMP op onto the parentheses stack. 417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This registers if for fixup when this block's close paren is encountered. 418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Append a NOP to the compiled pattern. This is the slot reserved 421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for a SAVE in the event that there is yet another '|' following 422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this one. 423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenCaptureParen: 430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open Paren. 431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile to a 432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which later may be replaced by a save-state if the 433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parenthesized group gets a * quantifier, followed by 434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - START_CAPTURE n where n is stack frame offset to the capture group variables. 435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which may later be replaced by a save-state if there 436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is an '|' alternation within the parens. 437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Each capture group gets three slots in the save stack frame: 43950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 0: Capture Group start position (in input string being matched.) 44050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 1: Capture Group end position. 44150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 2: Start of Match-in-progress. 442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The first two locations are for a completed capture group, and are 443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // referred to by back references and the like. 444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The third location stores the capture start position when an START_CAPTURE is 445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // encountered. This will be promoted to a completed capture when (and if) the corresponding 44650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // END_CAPTURE is encountered. 447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame. 450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fFrameSize += 3; 451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); 452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(cop, *fStatus); 453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the two NOPs. Depending on what follows in the pattern, the 457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // NOPs may be changed to SAVE_STATE or JMP ops, with a target 458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // address of the end of the parenthesized group. 459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(capturing, *fStatus); // Frame type. 461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location 462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Save the mapping from group number to stack frame variable position. 465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fGroupMap->addElement(varsLoc, *fStatus); 466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenNonCaptureParen: 470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open non-caputuring (grouping only) Paren. 471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile to a 472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which later may be replaced by a save-state if the 473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parenthesized group gets a * quantifier, followed by 474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which may later be replaced by a save-state if there 475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is an '|' alternation within the parens. 476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the two NOPs. 482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(plain, *fStatus); // Begin a new frame. 484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenAtomicParen: 491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open Atomic Paren. (?> 492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile to a 493c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // - NOP, which later may be replaced if the parenthesized group 494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // has a quantifier, followed by 495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - STO_SP save state stack position, so it can be restored at the ")" 496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which may later be replaced by a save-state if there 497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is an '|' alternation within the parens. 498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the 501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fDataSize += 1; // state stack ptr. 502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); 503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(stoOp, *fStatus); 504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the two NOPs. Depending on what follows in the pattern, the 508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // NOPs may be changed to SAVE_STATE or JMP ops, with a target 509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // address of the end of the parenthesized group. 510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(atomic, *fStatus); // Frame type. 512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP 513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenLookAhead: 519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Positive Look-ahead (?= stuff ) 520c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 521c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Note: Addition of transparent input regions, with the need to 522c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // restore the original regions when failing out of a lookahead 523c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // block, complicated this sequence. Some conbined opcodes 524c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // might make sense - or might not, lookahead aren't that common. 525c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 526c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Caution: min match length optimization knows about this 527c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // sequence; don't change without making updates there too. 528c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 530c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 1 START_LA dataLoc Saves SP, Input Pos 531c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 2. STATE_SAVE 4 on failure of lookahead, goto 4 532c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 3 JMP 6 continue ... 533c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 534c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4. LA_END Look Ahead failed. Restore regions. 535c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 5. BACKTRACK and back track again. 536c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 537c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 6. NOP reserved for use by quantifiers on the block. 538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look-ahead can't have quantifiers, but paren stack 539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // compile time conventions require the slot anyhow. 540c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 7. NOP may be replaced if there is are '|' ops in the block. 541c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 8. code for parenthesized stuff. 542c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 9. LA_END 543c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Two data slots are reserved, for saving the stack ptr and the input position. 545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = fRXPat->fDataSize; 547c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fRXPat->fDataSize += 2; 548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_LA_START, dataLoc); 549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 551c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); 552c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 553c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 554c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); 555c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 556c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 557c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_BUILD(URX_LA_END, dataLoc); 558c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 559c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 560c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_BUILD(URX_BACKTRACK, 0); 561c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 562c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_NOP, 0); 564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 568c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // of the NOPs. 569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(lookAhead, *fStatus); // Frame type. 571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenLookAheadNeg: 577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Negated Lookahead. (?! stuff ) 578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. START_LA dataloc 580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, 581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // // which continues with the match. 582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. NOP // Std. Open Paren sequence, for possible '|' 583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. code for parenthesized stuff. 584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. END_LA // Cut back stack, remove saved state from step 2. 585c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. 586c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 7. END_LA // Restore match region, in case look-ahead was using 587c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // an alternate (transparent) region. 588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = fRXPat->fDataSize; 590c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fRXPat->fDataSize += 2; 591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_LA_START, dataLoc); 592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later. 595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_NOP, 0); 598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 601c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // of the StateSave and NOP. 602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 603c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fParenStack.push(negLookAhead, *fStatus); // Frame type 604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location 605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 606c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 60750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Instructions #5 - #7 will be added when the ')' is encountered. 608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenLookBehind: 612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile a (?<= look-behind open paren. 614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 0 URX_LB_START dataLoc 617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1 URX_LB_CONT dataLoc 618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2 MinMatchLen 619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3 MaxMatchLen 620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4 URX_NOP Standard '(' boilerplate. 621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5 URX_NOP Reserved slot for use with '|' ops within (block). 622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 6 <code for LookBehind expression> 623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 7 URX_LB_END dataLoc # Check match len, restore input len 624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 8 URX_LA_END dataLoc # Restore stack, input pos 625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Allocate a block of matcher data, to contain (when running a match) 627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 0: Stack ptr on entry 628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1: Input Index on entry 629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2: Start index of match current match attempt. 630c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 3: Original Input String len. 631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 632b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Allocate data space 633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = fRXPat->fDataSize; 634c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fRXPat->fDataSize += 4; 635c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit URX_LB_START 637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_LB_START, dataLoc); 638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 639c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit URX_LB_CONT 641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_LB_CONT, dataLoc); 642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. 644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. 645c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the NOP 647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_NOP, 0); 648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 650c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 652c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // of the URX_LB_CONT and the NOP. 653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(lookBehind, *fStatus); // Frame type 655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 657c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The final two instructions will be added when the ')' is encountered. 659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenLookBehindNeg: 664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile a (?<! negated look-behind open paren. 666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 0 URX_LB_START dataLoc # Save entry stack, input len 669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1 URX_LBN_CONT dataLoc # Iterate possible match positions 670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2 MinMatchLen 671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3 MaxMatchLen 672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4 continueLoc (9) 673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5 URX_NOP Standard '(' boilerplate. 674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 6 URX_NOP Reserved slot for use with '|' ops within (block). 675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 7 <code for LookBehind expression> 676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL 677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 9 ... 678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Allocate a block of matcher data, to contain (when running a match) 680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 0: Stack ptr on entry 681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1: Input Index on entry 682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2: Start index of match current match attempt. 683c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 3: Original Input String len. 684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Allocate data space 686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = fRXPat->fDataSize; 687c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fRXPat->fDataSize += 4; 688c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit URX_LB_START 690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_LB_START, dataLoc); 691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 692c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit URX_LBN_CONT 694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_LBN_CONT, dataLoc); 695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. 697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. 698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later. 699c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the NOP 701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_NOP, 0); 702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 704c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 706c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // of the URX_LB_CONT and the NOP. 707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(lookBehindN, *fStatus); // Frame type 709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 711c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The final two instructions will be added when the ')' is encountered. 713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doConditionalExpr: 717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Conditionals such as (?(1)a:b) 718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPerlInline: 719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. 720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_UNIMPLEMENTED); 721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doCloseParen: 725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru handleCloseParen(); 726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fParenStack.size() <= 0) { 727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Extra close paren, or missing open paren. 728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MISMATCHED_PAREN); 729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doNOP: 733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBadOpenParenType: 737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doRuleError: 738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_RULE_SYNTAX); 739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doMismatchedParenErr: 743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MISMATCHED_PAREN); 744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPlus: 747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Normal '+' compiles to 748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. stuff to be repeated (already built) 749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. jmp-sav 1 750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. ... 751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Or, if the item to be repeated can match a zero length string, 753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STO_INP_LOC data-loc 754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of stuff to be repeated 755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. JMP_SAV_X 2 756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. ... 757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Or, if the item to be repeated is simple 760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. Item to be repeated. 761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. LOOP_SR_I set number (assuming repeated item is a set ref) 762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. LOOP_C stack location 763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(FALSE); // location of item #1 765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t frameLoc; 766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check for simple constructs, which may get special optimized code. 768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (topLoc == fRXPat->fCompiledPat->size() - 1) { 76950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_SETREF) { 772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit optimized code for [char set]+ 773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); 775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru frameLoc = fRXPat->fFrameSize; 776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fFrameSize++; 777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_DOTANY || 783c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 784c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit Optimized code for .+ operations. 786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 788c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. 789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopOpI |= 1; 790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 791c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fModeFlags & UREGEX_UNIX_LINES) { 792c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru loopOpI |= 2; 793c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); 795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru frameLoc = fRXPat->fFrameSize; 796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fFrameSize++; 797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // General case. 805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check for minimum match length of zero, which requires 807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // extra loop-breaking code. 808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { 809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Zero length match is possible. 810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the code sequence that can handle it. 811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topLoc); 812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru frameLoc = fRXPat->fFrameSize; 813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fFrameSize++; 814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); 816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc); 817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); 819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Simpler code when the repeated body must match something non-empty 822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); 823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); 824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doNGPlus: 829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Non-greedy '+?' compiles to 830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. stuff to be repeated (already built) 831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. state-save 1 832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. ... 833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(FALSE); 835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); 836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); 837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpt: 842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Normal (greedy) ? quantifier. 843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. state save 3 845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of optional block 846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. ... 847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert the state save into the compiled pattern, and we're done. 848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t saveStateLoc = blockTopLoc(TRUE); 850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); 851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doNGOpt: 856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Non-greedy ?? quantifier 857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // compiles to 858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. jmp 4 859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of optional block 860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3 jmp 5 861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. state save 2 862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5 ... 863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This code is less than ideal, with two jmps instead of one, because we can only 864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // insert one instruction at the top of the block being iterated. 865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmp1_loc = blockTopLoc(TRUE); 867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmp2_loc = fRXPat->fCompiledPat->size(); 868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); 870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); 871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); 873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); 874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); 876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(save_op, *fStatus); 877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doStar: 882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Normal (greedy) * quantifier. 883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STATE_SAVE 4 885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of stuff being iterated over 886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. JMP_SAV 2 887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. ... 888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Or, if the body is a simple [Set], 890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. LOOP_SR_I set number 891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. LOOP_C stack location 892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ... 893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 894c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Or if this is a .* 895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. LOOP_DOT_I (. matches all mode flag) 896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. LOOP_C stack location 897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Or, if the body can match a zero-length string, to inhibit infinite loops, 899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STATE_SAVE 5 900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. STO_INP_LOC data-loc 901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. body of stuff 902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. JMP_SAV_X 2 903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. ... 904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // location of item #1, the STATE_SAVE 906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(FALSE); 907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = -1; 908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check for simple *, where the construct being repeated 910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // compiled to single opcode, and might be optimizable. 911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (topLoc == fRXPat->fCompiledPat->size() - 1) { 91250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_SETREF) { 915c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Emit optimized code for a [char set]* 916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru dataLoc = fRXPat->fFrameSize; 919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fFrameSize++; 920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_DOTANY || 926c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 927c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit Optimized code for .* operations. 929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. 932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopOpI |= 1; 933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 934c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { 935c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru loopOpI |= 2; 936c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru dataLoc = fRXPat->fFrameSize; 939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fFrameSize++; 940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit general case code for this * 947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The optimizations did not apply. 948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t saveStateLoc = blockTopLoc(TRUE); 950b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); 951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check for minimum match length of zero, which requires 953b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // extra loop-breaking code. 954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { 955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(saveStateLoc); 956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru dataLoc = fRXPat->fFrameSize; 957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fFrameSize++; 958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); 960b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); 961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); 962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 963c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Locate the position in the compiled pattern where the match will continue 965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // after completing the *. (4 or 5 in the comment above) 966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 967b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 968b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Put together the save state op store it into the compiled code. 969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); 970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. 973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); 974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doNGStar: 978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Non-greedy *? quantifier 979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // compiles to 980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. JMP 3 981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of stuff being iterated over 982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. STATE_SAVE 2 983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4 ... 984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. 986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. 987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); 988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); 989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); 990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); 991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doIntervalInit: 996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The '{' opening an interval quantifier was just scanned. 997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Init the counter varaiables that will accumulate the values as the digits 998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are scanned. 999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIntervalLow = 0; 1000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIntervalUpper = -1; 1001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doIntevalLowerDigit: 1004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanned a digit from the lower value of an {lower,upper} interval 1005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t digitValue = u_charDigitValue(fC.fChar); 1007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(digitValue >= 0); 1008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIntervalLow = fIntervalLow*10 + digitValue; 1009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalLow < 0) { 1010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_NUMBER_TOO_BIG); 1011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doIntervalUpperDigit: 1016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanned a digit from the upper value of an {lower,upper} interval 1017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalUpper < 0) { 1019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIntervalUpper = 0; 1020b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t digitValue = u_charDigitValue(fC.fChar); 1022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(digitValue >= 0); 1023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIntervalUpper = fIntervalUpper*10 + digitValue; 1024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalUpper < 0) { 1025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_NUMBER_TOO_BIG); 1026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doIntervalSame: 1031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanned a single value interval like {27}. Upper = Lower. 1032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIntervalUpper = fIntervalLow; 1033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doInterval: 1036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Finished scanning a normal {lower,upper} interval. Generate the code for it. 1037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (compileInlineInterval() == FALSE) { 1038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPossessiveInterval: 1043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. 1044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1045b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Remember the loc for the top of the block being looped over. 1046c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // (Can not reserve a slot in the compiled pattern at this time, because 1047c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // compileInterval needs to reserve also, and blockTopLoc can only reserve 1048b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // once per block.) 1049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(FALSE); 1050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Produce normal looping code. 1052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Surround the just-emitted normal looping code with a STO_SP ... LD_SP 1055b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // just as if the loop was inclosed in atomic parentheses. 1056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // First the STO_SP before the start of the loop 1058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topLoc); 1059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the 1060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fDataSize += 1; // state stack ptr. 1061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_STO_SP, varLoc); 1062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc); 1063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 106450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); 1065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); 1066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopOp++; // point LoopOp after the just-inserted STO_SP 1067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->push(loopOp, *fStatus); 1068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Then the LD_SP after the end of the loop 1070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_LD_SP, varLoc); 1071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doNGInterval: 1077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. 1078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); 1079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doIntervalError: 1082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_BAD_INTERVAL); 1083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1084b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doLiteralChar: 1086c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We've just scanned a "normal" character from the pattern, 1087b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru literalChar(fC.fChar); 1088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1091c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doEscapedLiteralChar: 1092c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We've just scanned an backslashed escaped character with no 1093c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // special meaning. It represents itself. 1094c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1095c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1096c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1097c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1098c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1099c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru literalChar(fC.fChar); 1100c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1101c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doDotAny: 1104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // scanned a ".", match any single character. 1105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; 1107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_DOTALL) { 1108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_DOTANY_ALL, 0); 1109c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if (fModeFlags & UREGEX_UNIX_LINES) { 1110c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_BUILD(URX_DOTANY_UNIX, 0); 1111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_DOTANY, 0); 1113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1118c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doCaret: 1119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1120c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t op = 0; 1121c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1122c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_CARET; 1123c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1124c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_CARET_M; 1125c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1126c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_CARET; // Only testing true start of input. 1127c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1128c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_CARET_M_UNIX; 1129c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1134c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doDollar: 1135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1136c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t op = 0; 1137c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1138c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_DOLLAR; 1139c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1140c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_DOLLAR_M; 1141c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1142c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_DOLLAR_D; 1143c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1144c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_DOLLAR_MD; 1145c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashA: 1151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); 1152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashB: 1155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru #if UCONFIG_NO_BREAK_ITERATION==1 1157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_UWORD) { 1158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_UNSUPPORTED_ERROR); 1159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru #endif 1161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); 1163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashb: 1167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru #if UCONFIG_NO_BREAK_ITERATION==1 1169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_UWORD) { 1170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_UNSUPPORTED_ERROR); 1171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru #endif 1173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashD: 1179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus); 1180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashd: 1183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus); 1184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashG: 1187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus); 1188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashS: 1191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement( 1192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); 1193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashs: 1196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement( 1197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); 1198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashW: 1201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement( 1202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); 1203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashw: 1206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement( 1207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); 1208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashX: 1211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus); 1212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashZ: 1216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); 1217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashz: 1220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus); 1221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doEscapeError: 1224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doExit: 1228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru returnVal = FALSE; 1229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doProperty: 1232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *theSet = scanProp(); 1234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru compileSet(theSet); 1235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1238c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doNamedChar: 1239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1240c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c = scanNamedChar(); 1241c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru literalChar(c); 1242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1244c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackRef: 1247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // BackReference. Somewhat unusual in that the front-end can not completely parse 1248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the regular expression, because the number of digits to be consumed 1249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // depends on the number of capture groups that have been defined. So 1250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we have to do it here instead. 1251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t numCaptureGroups = fRXPat->fGroupMap->size(); 1253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t groupNum = 0; 1254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = fC.fChar; 1255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 1257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop once per digit, for max allowed number of digits in a back reference. 1258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t digit = u_charDigitValue(c); 1259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru groupNum = groupNum * 10 + digit; 1260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (groupNum >= numCaptureGroups) { 1261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = peekCharLL(); 1264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) { 1265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextCharLL(); 1268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scan of the back reference in the source regexp is complete. Now generate 1271c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the compiled code for it. 1272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Because capture groups can be forward-referenced by back-references, 1273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we fill the operand with the capture group number. At the end 1274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of compilation, it will be changed to the variable's location. 1275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(groupNum > 0); 1276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; 1277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_BACKREF_I, groupNum); 1279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_BACKREF, groupNum); 1281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPossessivePlus: 1288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Possessive ++ quantifier. 1289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 1290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STO_SP 1291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of stuff being iterated over 1292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. STATE_SAVE 5 1293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. JMP 2 1294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. LD_SP 1295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 6. ... 1296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note: TODO: This is pretty inefficient. A mass of saved state is built up 1298c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 1299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the STO_SP 1302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(TRUE); 1303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stoLoc = fRXPat->fDataSize; 1304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc); 1307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the STATE_SAVE 1309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); 1310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1311c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the JMP 1313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_JMP, topLoc+1); 1314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the LD_SP 1317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_LD_SP, stoLoc); 1318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPossessiveStar: 1323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Possessive *+ quantifier. 1324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 1325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STO_SP loc 1326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. STATE_SAVE 5 1327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. body of stuff being iterated over 1328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. JMP 2 1329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. LD_SP loc 1330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 6 ... 1331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: do something to cut back the state stack each time through the loop. 1332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Reserve two slots at the top of the block. 1334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(TRUE); 1335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topLoc); 1336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // emit STO_SP loc 1338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stoLoc = fRXPat->fDataSize; 1339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc); 1342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the SAVE_STATE 5 1344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t L7 = fRXPat->fCompiledPat->size()+1; 1345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_STATE_SAVE, L7); 1346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1348c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Append the JMP operation. 1349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_JMP, topLoc+1); 1350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the LD_SP loc 1353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_LD_SP, stoLoc); 1354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPossessiveOpt: 1359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Possessive ?+ quantifier. 1360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 1361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STO_SP loc 1362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. SAVE_STATE 5 1363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. body of optional block 1364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. LD_SP loc 1365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. ... 1366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Reserve two slots at the top of the block. 1369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(TRUE); 1370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topLoc); 1371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the STO_SP 1373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stoLoc = fRXPat->fDataSize; 1374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc); 1377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the SAVE_STATE 1379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 1380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_STATE_SAVE, continueLoc); 1381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the LD_SP 1384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_LD_SP, stoLoc); 1385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBeginMatchMode: 1391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNewModeFlags = fModeFlags; 1392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSetModeFlag = TRUE; 1393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doMatchMode: // (?i) and similar 1396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t bit = 0; 1398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (fC.fChar) { 1399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break; 1400c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break; 1401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break; 1402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x73: /* 's' */ bit = UREGEX_DOTALL; break; 1403c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break; 1404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x77: /* 'w' */ bit = UREGEX_UWORD; break; 1405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; 1406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x2d: /* '-' */ fSetModeFlag = FALSE; break; 1407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 1408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); // Should never happen. Other chars are filtered out 1409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // by the scanner. 1410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSetModeFlag) { 1412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNewModeFlags |= bit; 1413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNewModeFlags &= ~bit; 1415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doSetMatchMode: 1420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've got a (?i) or similar. The match mode is being changed, but 1421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the change is not scoped to a parenthesized block. 1422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(fNewModeFlags < 0); 1423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fModeFlags = fNewModeFlags; 1424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Prevent any string from spanning across the change of match mode. 1426c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef" 1427c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fixLiterals(); 1428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doMatchModeParen: 1432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've got a (?i: or similar. Begin a parenthesized block, save old 1433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // mode flags so they can be restored at the close of the block. 1434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile to a 1436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which later may be replaced by a save-state if the 1437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parenthesized group gets a * quantifier, followed by 1438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which may later be replaced by a save-state if there 1439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is an '|' alternation within the parens. 1440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 1445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the two NOPs (a normal non-capturing () frame, except for the 1446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // saving of the orignal mode flags.) 1447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); 1448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(flags, *fStatus); // Frame Marker 1449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP 1450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 1451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Set the current mode flags to the new values. 1453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(fNewModeFlags < 0); 1454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fModeFlags = fNewModeFlags; 1455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBadModeFlag: 1459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_INVALID_FLAG); 1460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doSuppressComments: 1463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We have just scanned a '(?'. We now need to prevent the character scanner from 1464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // treating a '#' as a to-the-end-of-line comment. 1465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (This Perl compatibility just gets uglier and uglier to do...) 1466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fEOLComments = FALSE; 1467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1470c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetAddAmp: 1471c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1472c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1473c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(chAmp); 1474c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1475c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1476c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1477c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetAddDash: 1478c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1479c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1480c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(chDash); 1481c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1482c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1483c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1484c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_s: 1485c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1486c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1487c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1488c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1489c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1490c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1491c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_S: 1492c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1493c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1494c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1495c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru SSet.complement(); 1496c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(SSet); 1497c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1498c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1499c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1500c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_d: 1501c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1502c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1503c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO - make a static set, ticket 6058. 1504c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_ND_MASK, *fStatus); 1505c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1506c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1507c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1508c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_D: 1509c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1510c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1511c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet digits; 1512c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO - make a static set, ticket 6058. 1513c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 1514c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru digits.complement(); 1515c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(digits); 1516c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1517c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1518c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1519c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_w: 1520c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1521c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1522c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1523c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1524c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1525c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1526c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_W: 1527c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1528c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1529c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1530c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru SSet.complement(); 1531c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(SSet); 1532c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1533c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1534c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1535c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBegin: 1536c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetStack.push(new UnicodeSet(), *fStatus); 1537c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setStart, *fStatus); 1538c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1539c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setCaseClose, *fStatus); 1540c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1541c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1542c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1543c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBeginDifference1: 1544c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned something like [[abc]-[ 1545c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Set up a new UnicodeSet for the set beginning with the just-scanned '[' 1546c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Push a Difference operator, which will cause the new set to be subtracted from what 1547c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // went before once it is created. 1548c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setPushOp(setDifference1); 1549c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setStart, *fStatus); 1550c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1551c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setCaseClose, *fStatus); 1552c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1553c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1554c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1555c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBeginIntersection1: 1556c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned something like [[abc]&[ 1557c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Need both the '&' operator and the open '[' operator. 1558c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setPushOp(setIntersection1); 1559c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setStart, *fStatus); 1560c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1561c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setCaseClose, *fStatus); 1562c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1563c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1564c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1565c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBeginUnion: 1566c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned something like [[abc][ 1567c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Need to handle the union operation explicitly [[abc] | [ 1568c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setPushOp(setUnion); 1569c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setStart, *fStatus); 1570c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1571c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setCaseClose, *fStatus); 1572c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1573c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1574c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1575c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetDifference2: 1576c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned something like [abc-- 1577c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Consider this to unambiguously be a set difference operator. 1578c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setPushOp(setDifference2); 1579c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1580c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1581c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetEnd: 1582c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Have encountered the ']' that closes a set. 1583c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Force the evaluation of any pending operations within this set, 1584c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // leave the completed set on the top of the set stack. 1585c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setEval(setEnd); 1586b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru U_ASSERT(fSetOpStack.peeki()==setStart); 1587b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru fSetOpStack.popi(); 1588c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1589c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1590c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetFinish: 1591c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1592c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Finished a complete set expression, including all nested sets. 1593c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The close bracket has already triggered clearing out pending set operators, 1594c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the operator stack should be empty and the operand stack should have just 1595c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // one entry, the result set. 1596c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fSetOpStack.empty()); 1597c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); 1598c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fSetStack.empty()); 1599c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru compileSet(theSet); 1600c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1601c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1602c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1603c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetIntersection2: 1604c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Have scanned something like [abc&& 1605c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setPushOp(setIntersection2); 1606c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1607c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1608c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetLiteral: 1609c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Union the just-scanned literal character into the set being built. 1610c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This operation is the highest precedence set operation, so we can always do 1611c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // it immediately, without waiting to see what follows. It is necessary to perform 1612c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // any pending '-' or '&' operation first, because these have the same precedence 1613c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // as union-ing in a literal' 1614c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1615c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setEval(setUnion); 1616c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1617c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru s->add(fC.fChar); 1618c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLastSetLiteral = fC.fChar; 1619c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1620c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1621c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1622c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetLiteralEscaped: 1623c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // A back-slash escaped literal character was encountered. 1624c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Processing is the same as with setLiteral, above, with the addition of 1625c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the optional check for errors on escaped ASCII letters. 1626c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1627c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1628c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1629c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1630c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1631c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1632c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setEval(setUnion); 1633c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1634c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru s->add(fC.fChar); 1635c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLastSetLiteral = fC.fChar; 1636c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1637c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1638c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1639c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetNamedChar: 1640c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scanning a \N{UNICODE CHARACTER NAME} 1641c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Aside from the source of the character, the processing is identical to doSetLiteral, 1642c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // above. 1643c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1644c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c = scanNamedChar(); 1645c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setEval(setUnion); 1646c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1647c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru s->add(c); 1648c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLastSetLiteral = c; 1649c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1650c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1651c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1652c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetNamedRange: 1653c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned literal-\N{CHAR NAME}. Add the range to the set. 1654c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The left character is already in the set, and is saved in fLastSetLiteral. 1655c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The right side needs to be picked up, the scan is at the 'N'. 1656c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Lower Limit > Upper limit being an error matches both Java 1657c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // and ICU UnicodeSet behavior. 1658c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1659c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c = scanNamedChar(); 1660c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) { 1661c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_INVALID_RANGE); 1662c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1663c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1664c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru s->add(fLastSetLiteral, c); 1665c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLastSetLiteral = c; 1666c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1667c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1668c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1669c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 167050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case doSetNegate: 1671c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scanned a '^' at the start of a set. 1672c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Push the negation operator onto the set op stack. 1673c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // A twist for case-insensitive matching: 1674c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the case closure operation must happen _before_ negation. 1675c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // But the case closure operation will already be on the stack if it's required. 1676c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This requires checking for case closure, and swapping the stack order 1677c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // if it is present. 1678c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1679c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t tosOp = fSetOpStack.peeki(); 1680c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (tosOp == setCaseClose) { 1681c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.popi(); 1682c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setNegation, *fStatus); 1683c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setCaseClose, *fStatus); 1684c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 1685c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setNegation, *fStatus); 1686c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1687c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1688c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1689c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1690c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetNoCloseError: 1691c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_MISSING_CLOSE_BRACKET); 1692c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1693c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1694c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetOpError: 1695c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal. 1696c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1697c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1698c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetPosixProp: 1699c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1700c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = scanPosixProp(); 1701c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (s != NULL) { 1702c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1703c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru tos->addAll(*s); 1704c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete s; 1705c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } // else error. scanProp() reported the error status already. 1706c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1707c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1708c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1709c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetProp: 1710c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scanned a \p \P within [brackets]. 1711c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1712c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = scanProp(); 1713c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (s != NULL) { 1714c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1715c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru tos->addAll(*s); 1716c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete s; 1717c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } // else error. scanProp() reported the error status already. 1718c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1719c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1720c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1721c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1722c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetRange: 1723c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned literal-literal. Add the range to the set. 1724c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The left character is already in the set, and is saved in fLastSetLiteral. 1725c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The right side is the current character. 1726c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Lower Limit > Upper limit being an error matches both Java 1727c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // and ICU UnicodeSet behavior. 1728c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1729c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fLastSetLiteral > fC.fChar) { 1730c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_INVALID_RANGE); 1731c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1732c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1733c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru s->add(fLastSetLiteral, fC.fChar); 1734c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1735c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1736c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 1739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 1740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_INTERNAL_ERROR); 1741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 1745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru returnVal = FALSE; 1746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return returnVal; 1749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// literalChar We've encountered a literal character from the pattern, 1756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// or an escape sequence that reduces to a character. 1757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Add it to the string containing all literal chars/strings from 1758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// the pattern. 1759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// If we are in a pattern string already, add the new char to it. 1760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// If we aren't in a pattern string, begin one now. 1761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::literalChar(UChar32 c) { 1764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; // An operation in the compiled pattern. 1765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType; 1766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patternLoc; // A position in the compiled pattern. 1767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringLen; 1768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the last thing compiled into the pattern was not a literal char, 1771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // force this new literal char to begin a new string, and not append to the previous. 177250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->lastElementi(); 1773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType = URX_TYPE(op); 1774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!(opType == URX_STRING_LEN || opType == URX_ONECHAR || opType == URX_ONECHAR_I)) { 1775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fixLiterals(); 1776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fStringOpStart == -1) { 1779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // First char of a string in the pattern. 1780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit a OneChar op into the compiled pattern. 1781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru emitONE_CHAR(c); 178250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 178350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Mark that we might actually be starting a string here 1784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStringOpStart = fRXPat->fLiteralText.length(); 1785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 1786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1787c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 178850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->lastElementi(); 1789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType = URX_TYPE(op); 1790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN); 1791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1792c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // If the most recently emitted op is a URX_ONECHAR, 1793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) { 1794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) { 1795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The most recently emitted op is a ONECHAR that was the first half 1796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of a surrogate pair. Update the ONECHAR's operand to be the 1797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // supplementary code point resulting from both halves of the pair. 1798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = U16_GET_SUPPLEMENTARY(URX_VAL(op), c); 1799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(opType, c); 1800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternLoc = fRXPat->fCompiledPat->size() - 1; 1801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, patternLoc); 1802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 1803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 180450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The most recently emitted op is a ONECHAR. 1806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've now received another adjacent char. Change the ONECHAR op 1807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to a string op. 180850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fLiteralText.append(URX_VAL(op)); 180950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_STRING_I, fStringOpStart); 1812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_STRING, fStringOpStart); 1814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternLoc = fRXPat->fCompiledPat->size() - 1; 1816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, patternLoc); 1817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_STRING_LEN, 0); 1818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 182050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 182150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // We are adding onto an existing string 182250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fLiteralText.append(c); 182350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The pattern contains a URX_SRING / URX_STRING_LEN. Update the 1825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // string length to reflect the new char we just added to the string. 1826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru stringLen = fRXPat->fLiteralText.length() - fStringOpStart; 1827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_STRING_LEN, stringLen); 1828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patternLoc = fRXPat->fCompiledPat->size() - 1; 1829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, patternLoc); 1830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// emitONE_CHAR emit a ONE_CHAR op into the generated code. 1837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Choose cased or uncased version, depending on the 1838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// match mode and whether the character itself is cased. 1839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::emitONE_CHAR(UChar32 c) { 1842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; 1843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && 1844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 1845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We have a cased character, and are in case insensitive matching mode. 184650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho //c = u_foldCase(c, U_FOLD_CASE_DEFAULT); // !!!: handled in stripNOPs() now 1847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_ONECHAR_I, c); 1848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Uncased char, or case sensitive match mode. 1850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Either way, just generate a literal compare of the char. 1851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_ONECHAR, c); 1852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 1854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// fixLiterals When compiling something that can follow a literal 1860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// string in a pattern, we need to "fix" any preceding 1861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// string, which will cause any subsequent literals to 1862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// begin a new string, rather than appending to the 1863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// old one. 1864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Optionally, split the last char of the string off into 1866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// a single "ONE_CHAR" operation, so that quantifiers can 1867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// apply to that char alone. Example: abc* 1868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The * must apply to the 'c' only. 1869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::fixLiterals(UBool split) { 1872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringStart = fStringOpStart; // start index of the current literal string 1873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; // An op from/for the compiled pattern. 1874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType; // An opcode type from the compiled pattern. 1875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringLastCharIdx; 1876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 lastChar; 1877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringNextToLastCharIdx; 1878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 nextToLastChar; 1879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringLen; 1880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1881c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fStringOpStart = -1; 1882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (!split) { 1883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 1884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Split: We need to ensure that the last item in the compiled pattern does 1887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // not refer to a literal string of more than one char. If it does, 1888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // separate the last char from the rest of the string. 1889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the last operation from the compiled pattern is not a string, 1891c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // nothing needs to be done 189250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->lastElementi(); 1893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType = URX_TYPE(op); 1894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (opType != URX_STRING_LEN) { 1895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 1896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru stringLen = URX_VAL(op); 1898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Find the position of the last code point in the string (might be a surrogate pair) 1901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru stringLastCharIdx = fRXPat->fLiteralText.length(); 1903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru stringLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); 1904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru lastChar = fRXPat->fLiteralText.char32At(stringLastCharIdx); 1905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The string should always be at least two code points long, meaning that there 1907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // should be something before the last char position that we just found. 1908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(stringLastCharIdx > stringStart); 1909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru stringNextToLastCharIdx = fRXPat->fLiteralText.moveIndex32(stringLastCharIdx, -1); 1910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(stringNextToLastCharIdx >= stringStart); 1911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextToLastChar = fRXPat->fLiteralText.char32At(stringNextToLastCharIdx); 1912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (stringNextToLastCharIdx > stringStart) { 1914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The length of string remaining after removing one char is two or more. 1915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Leave the string in the compiled pattern, shorten it by one char, 1916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and append a URX_ONECHAR op for the last char. 1917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru stringLen -= (fRXPat->fLiteralText.length() - stringLastCharIdx); 1918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_STRING_LEN, stringLen); 1919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, fRXPat->fCompiledPat->size() -1); 1920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru emitONE_CHAR(lastChar); 1921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The original string consisted of exactly two characters. Replace 1923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the existing compiled URX_STRING/URX_STRING_LEN ops with a pair 1924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of URX_ONECHARs. 1925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setSize(fRXPat->fCompiledPat->size() -2); 1926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru emitONE_CHAR(nextToLastChar); 1927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru emitONE_CHAR(lastChar); 1928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// insertOp() Insert a slot for a new opcode into the already 1939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// compiled pattern code. 1940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Fill the slot with a NOP. Our caller will replace it 1942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// with what they really wanted. 1943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::insertOp(int32_t where) { 194650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UVector64 *code = fRXPat->fCompiledPat; 1947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(where>0 && where < code->size()); 1948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t nop = URX_BUILD(URX_NOP, 0); 1950b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru code->insertElementAt(nop, where, *fStatus); 1951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Walk through the pattern, looking for any ops with targets that 1953b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // were moved down by the insert. Fix them. 1954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loc; 1955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=0; loc<code->size(); loc++) { 195650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t op = (int32_t)code->elementAti(loc); 1957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType = URX_TYPE(op); 1958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opValue = URX_VAL(op); 1959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((opType == URX_JMP || 1960b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_JMPX || 1961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_STATE_SAVE || 1962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_CTR_LOOP || 1963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_CTR_LOOP_NG || 1964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_JMP_SAV || 1965c53bf83a40a6888f5b246a73f13f6c919de1f5f9claireho opType == URX_JMP_SAV_X || 1966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_RELOC_OPRND) && opValue > where) { 1967b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Target location for this opcode is after the insertion point and 1968b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // needs to be incremented to adjust for the insertion. 1969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opValue++; 1970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(opType, opValue); 1971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru code->setElementAt(op, loc); 1972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Now fix up the parentheses stack. All positive values in it are locations in 1976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) 1977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=0; loc<fParenStack.size(); loc++) { 1978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t x = fParenStack.elementAti(loc); 1979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(x < code->size()); 1980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (x>where) { 1981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru x++; 1982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.setElementAt(x, loc); 1983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fMatchCloseParen > where) { 1987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchCloseParen++; 1988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fMatchOpenParen > where) { 1990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchOpenParen++; 1991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// blockTopLoc() Find or create a location in the compiled pattern 1999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// at the start of the operation or block that has 2000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// just been compiled. Needed when a quantifier (* or 2001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// whatever) appears, and we need to add an operation 2002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// at the start of the thing being quantified. 2003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// (Parenthesized Blocks) have a slot with a NOP that 2005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// is reserved for this purpose. .* or similar don't 2006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// and a slot needs to be added. 2007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// parameter reserveLoc : TRUE - ensure that there is space to add an opcode 2009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// at the returned location. 2010c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// FALSE - just return the address, 2011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// do not reserve a location there. 2012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t RegexCompile::blockTopLoc(UBool reserveLoc) { 2015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t theLoc; 2016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fRXPat->fCompiledPat->size() == fMatchCloseParen) 2017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The item just processed is a parenthesized block. 2019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru theLoc = fMatchOpenParen; // A slot is already reserved for us. 2020b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(theLoc > 0); 2021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP); 2022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Item just compiled is a single thing, a ".", or a single char, or a set reference. 2025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // No slot for STATE_SAVE was pre-reserved in the compiled code. 2026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We need to make space now. 2027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fixLiterals(TRUE); // If last item was a string, separate the last char. 2028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru theLoc = fRXPat->fCompiledPat->size()-1; 2029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (reserveLoc) { 2030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru /*int32_t opAtTheLoc = fRXPat->fCompiledPat->elementAti(theLoc);*/ 2031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t nop = URX_BUILD(URX_NOP, 0); 2032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); 2033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return theLoc; 2036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// handleCloseParen When compiling a close paren, we need to go back 2043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// and fix up any JMP or SAVE operations within the 2044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// parenthesized block that need to target the end 2045b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// of the block. The locations of these are kept on 2046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// the paretheses stack. 2047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2048b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// This function is called both when encountering a 2049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// real ) and at the end of the pattern. 2050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::handleCloseParen() { 2053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patIdx; 2054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patOp; 2055b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fParenStack.size() <= 0) { 2056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MISMATCHED_PAREN); 2057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 2058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Force any literal chars that may follow the close paren to start a new string, 2061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and not attach to any preceding it. 2062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fixLiterals(FALSE); 2063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Fixup any operations within the just-closed parenthesized group 2065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // that need to reference the end of the (block). 2066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (The first one popped from the stack is an unused slot for 2067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // alternation (OR) state save, but applying the fixup to it does no harm.) 2068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 2069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patIdx = fParenStack.popi(); 2070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (patIdx < 0) { 2071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // value < 0 flags the start of the frame on the paren stack. 2072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); 207550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx); 2076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set. 2077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patOp |= fRXPat->fCompiledPat->size(); // Set it now. 2078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(patOp, patIdx); 2079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchOpenParen = patIdx; 2080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // At the close of any parenthesized block, restore the match mode flags to 2083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the value they had at the open paren. Saved value is 2084c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // at the top of the paren stack. 2085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fModeFlags = fParenStack.popi(); 2086b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(fModeFlags < 0); 2087c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // DO any additional fixups, depending on the specific kind of 2089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parentesized grouping this is 2090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (patIdx) { 2092b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case plain: 2093b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case flags: 2094b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // No additional fixups required. 2095b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (Grouping-only parentheses) 2096b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2097b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case capturing: 2098b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Capturing Parentheses. 2099b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert a End Capture op into the pattern. 2100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The frame offset of the variables for this cg is obtained from the 2101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // start capture op and put it into the end-capture op. 2102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 210350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); 2105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t frameVarLocation = URX_VAL(captureOp); 2107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation); 2108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); 2109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case atomic: 2112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Atomic Parenthesis. 2113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert a LD_SP operation to restore the state stack to the position 2114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // it was when the atomic parens were entered. 2115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 211650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); 2118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stoLoc = URX_VAL(stoOp); 2119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); 2120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(ldOp, *fStatus); 2121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case lookAhead: 2125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 212650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = URX_VAL(startOp); 2129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_LA_END, dataLoc); 2130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 2131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case negLookAhead: 2135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // See comment at doOpenLookAheadNeg 213750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); 2138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = URX_VAL(startOp); 2140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_LA_END, dataLoc); 2141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 2142c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru op = URX_BUILD(URX_BACKTRACK, 0); 2143c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 214450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = URX_BUILD(URX_LA_END, dataLoc); 2145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 2146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Patch the URX_SAVE near the top of the block. 2148c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The destination of the SAVE is the final LA_END that was just added. 214950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen); 2150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); 2151c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t dest = fRXPat->fCompiledPat->size()-1; 2152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru saveOp = URX_BUILD(URX_STATE_SAVE, dest); 2153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); 2154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case lookBehind: 2158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // See comment at doOpenLookBehind. 2160c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Append the URX_LB_END and URX_LA_END to the compiled pattern. 216250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); 2163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = URX_VAL(startOp); 2165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_LB_END, dataLoc); 2166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 2167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_LA_END, dataLoc); 2168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 2169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Determine the min and max bounds for the length of the 2171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // string that the pattern can match. 2172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // An unbounded upper limit is an error. 2173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (maxML == INT32_MAX) { 2177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_LOOK_BEHIND_LIMIT); 2178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(minML <= maxML); 2181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert the min and max match len bounds into the URX_LB_CONT op that 2183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // appears at the top of the look-behind block, at location fMatchOpenParen+1 2184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); 2185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); 2186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case lookBehindN: 2193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // See comment at doOpenLookBehindNeg. 2195c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Append the URX_LBN_END to the compiled pattern. 219750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = URX_VAL(startOp); 2200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(URX_LBN_END, dataLoc); 2201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 2202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Determine the min and max bounds for the length of the 2204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // string that the pattern can match. 2205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // An unbounded upper limit is an error. 2206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (maxML == INT32_MAX) { 2210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_LOOK_BEHIND_LIMIT); 2211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(minML <= maxML); 2214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert the min and max match len bounds into the URX_LB_CONT op that 2216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // appears at the top of the look-behind block, at location fMatchOpenParen+1 2217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); 2218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); 2219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert the pattern location to continue at after a successful match 2221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // as the last operand of the URX_LBN_CONT 2222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); 2223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); 2224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 2230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 2231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // remember the next location in the compiled pattern. 2234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The compilation of Quantifiers will look at this to see whether its looping 2235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // over a parenthesized block or a single item 2236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchCloseParen = fRXPat->fCompiledPat->size(); 2237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// compileSet Compile the pattern operations for a reference to a 2244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// UnicodeSet. 2245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::compileSet(UnicodeSet *theSet) 2248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (theSet == NULL) { 2250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 2251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2252c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Remove any strings from the set. 2253c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // There shoudn't be any, but just in case. 2254c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // (Case Closure can add them; if we had a simple case closure avaialble that 2255c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // ignored strings, that would be better.) 2256c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru theSet->removeAllStrings(); 2257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t setSize = theSet->size(); 2258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (setSize) { 2260c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case 0: 2261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2262c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Set of no elements. Always fails to match. 2263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus); 2264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete theSet; 2265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2267c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 1: 2269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The set contains only a single code point. Put it into 2271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the compiled pattern as a single char operation rather 2272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // than a set, and discard the set itself. 227350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho literalChar(theSet->charAt(0)); 2274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete theSet; 2275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2277c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2278c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru default: 2279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The set contains two or more chars. (the normal case) 2281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Put it into the compiled pattern as a set. 2282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t setNumber = fRXPat->fSets->size(); 2283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fSets->addElement(theSet, *fStatus); 2284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t setOp = URX_BUILD(URX_SETREF, setNumber); 2285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(setOp, *fStatus); 2286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// compileInterval Generate the code for a {min, max} style interval quantifier. 2294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Except for the specific opcodes used, the code is the same 2295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// for all three types (greedy, non-greedy, possessive) of 2296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// intervals. The opcodes are supplied as parameters. 2297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The code for interval loops has this form: 2299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 0 CTR_INIT counter loc (in stack frame) 2300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1 5 patt address of CTR_LOOP at bottom of block 2301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2 min count 2302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3 max count (-1 for unbounded) 2303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4 ... block to be iterated over 2304c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 5 CTR_LOOP 2305c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 2306c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// In 2307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) 2309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The CTR_INIT op at the top of the block with the {n,m} quantifier takes 2311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // four slots in the compiled code. Reserve them. 2312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topOfBlock = blockTopLoc(TRUE); 2313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topOfBlock); 2314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topOfBlock); 2315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topOfBlock); 2316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The operands for the CTR_INIT opcode include the index in the matcher data 2318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the counter. Allocate it now. 2319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t counterLoc = fRXPat->fFrameSize; 2320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fFrameSize++; 2321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = URX_BUILD(InitOp, counterLoc); 2323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topOfBlock); 2324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The second operand of CTR_INIT is the location following the end of the loop. 2326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the 2327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // compilation of something later on causes the code to grow and the target 2328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // position to move. 2329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopEnd = fRXPat->fCompiledPat->size(); 2330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(URX_RELOC_OPRND, loopEnd); 2331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); 2332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Followed by the min and max counts. 2334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); 2335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); 2336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. 2338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Goes at end of the block being looped over, so just append to the code so far. 2339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(LoopOp, topOfBlock); 2340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 2341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((fIntervalLow & 0xff000000) != 0 || 234327f654740f2a26ad62a5c155af9199af9e69b889claireho (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { 2344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_NUMBER_TOO_BIG); 2345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { 2348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MAX_LT_MIN); 2349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool RegexCompile::compileInlineInterval() { 2355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { 2356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Too big to inline. Fail, which will cause looping code to be generated. 2357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (Upper < Lower picks up unbounded upper and errors, both.) 2358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topOfBlock = blockTopLoc(FALSE); 2362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalUpper == 0) { 2363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Pathological case. Attempt no matches, as if the block doesn't exist. 2364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setSize(topOfBlock); 2365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { 2369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The thing being repeated is not a single op, but some 2370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // more complex block. Do it as a loop, not inlines. 2371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note that things "repeated" a max of once are handled as inline, because 2372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the one copy of the code already generated is just fine. 2373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Pick up the opcode that is to be repeated 2377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 237850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); 2379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2380c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Compute the pattern location where the inline sequence 2381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // will end, and set up the state save op that will be needed. 2382c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 2383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 2384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru + fIntervalUpper + (fIntervalUpper-fIntervalLow); 2385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); 2386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalLow == 0) { 2387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topOfBlock); 2388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); 2389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop, emitting the op for the thing being repeated each time. 2394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop starts at 1 because one instance of the op already exists in the pattern, 2395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // it was put there when it was originally encountered. 2396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i; 2397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=1; i<fIntervalUpper; i++ ) { 2398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i == fIntervalLow) { 2399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(saveOp, *fStatus); 2400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (i > fIntervalLow) { 2402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(saveOp, *fStatus); 2403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->addElement(op, *fStatus); 2405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// matchStartType Determine how a match can start. 2414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Used to optimize find() operations. 2415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Operation is very similar to minMatchLength(). Walk the compiled 2417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// pattern, keeping an on-going minimum-match-length. For any 2418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// op where the min match coming in is zero, add that ops possible 2419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// starting matches to the possible starts for the overall pattern. 2420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::matchStartType() { 2423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 2424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 2425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loc; // Location in the pattern of the current op being processed. 2429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; // The op being processed 2430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType; // The opcode type of the op 2431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern 2432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t numInitialStrings = 0; // Number of strings encountered that could match at start. 2433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool atStart = TRUE; // True if no part of the pattern yet encountered 2435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // could have advanced the position in a match. 2436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (Maximum match length so far == 0) 2437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forwardedLength is a vector holding minimum-match-length values that 2439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // It must be one longer than the pattern being checked because some ops 2441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // will jmp to a end-of-block+1 location from within a block, and we must 2442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // count those when checking the block. 2443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end = fRXPat->fCompiledPat->size(); 2444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 forwardedLength(end+1, *fStatus); 2445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setSize(end+1); 2446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=3; loc<end; loc++) { 2447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(INT32_MAX, loc); 2448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc = 3; loc<end; loc++) { 245150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType = URX_TYPE(op); 2453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The loop is advancing linearly through the pattern. 2455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the op we are now at was the destination of a branch in the pattern, 2456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and that path has a shorter minimum length than the current accumulated value, 2457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // replace the current accumulated value. 2458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(loc) < currentLen) { 2459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc); 2460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (opType) { 2464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that don't change the total length matched 2465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RESERVED_OP: 2466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END: 2467c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_FAIL: 2468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_LEN: 2469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_NOP: 2470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_START_CAPTURE: 2471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END_CAPTURE: 2472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_B: 2473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_BU: 2474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_G: 2475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_Z: 2476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOLLAR: 2477c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_M: 2478c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_D: 2479c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_MD: 2480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RELOC_OPRND: 2481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_INP_LOC: 2482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF_I: 248450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LD_SP: 2487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2488c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET: 2490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (atStart) { 2491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_START; 2492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET_M: 2496c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_CARET_M_UNIX: 2497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (atStart) { 2498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_LINE; 2499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2501c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR: 2503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This character could appear at the start of a match. 2505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Add it to the set of possible starting characters. 2506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->add(URX_VAL(op)); 2507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2513c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2514c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_SETREF: 2515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t sn = URX_VAL(op); 2517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(*s); 2520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_SR_I: 2527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // [Set]*, like a SETREF, above, in what it can match, 2528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // but may not match at all, so currentLen is not incremented. 2529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t sn = URX_VAL(op); 2531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(*s); 2534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_DOT_I: 2540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // .* at the start of a pattern. 2542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Any character can begin the match. 2543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->clear(); 2544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->complement(); 2545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2551c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_STATIC_SETREF: 2552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t sn = URX_VAL(op); 2554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(sn>0 && sn<URX_LAST_SET); 2555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(*s); 2557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2565c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_STAT_SETREF_N: 2566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t sn = URX_VAL(op); 2568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet sc(*s); 2570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sc.complement(); 2571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(sc); 2572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_D: 2581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Digit Char 2582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2583c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet s; 2584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 2585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_VAL(op) != 0) { 2586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru s.complement(); 2587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(s); 2589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR_I: 2597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Case Insensitive Single Character. 2598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = URX_VAL(op); 2600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // character may have distinct cased forms. Add all of them 2602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to the set of possible starting match chars. 2603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet s(c, c); 2604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru s.closeOver(USET_CASE_INSENSITIVE); 2605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(s); 2606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 2607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Char has no case variants. Just add it as-is to the 2608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // set of possible starting chars. 2609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->add(c); 2610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2618b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 2619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY_ALL: // . matches one or two. 2620b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY: 2621c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOTANY_UNIX: 2622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2623b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // These constructs are all bad news when they appear at the start 2624b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of a match. Any character can begin the match. 2625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->clear(); 2626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->complement(); 2627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2632b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMPX: 2635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; // Except for extra operand on URX_JMPX, same as URX_JMP. 2636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP: 2637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 2639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest < loc) { 2640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop of some kind. Can safely ignore, the worst that will happen 2641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is that we understate the true minimum length 2642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc+1); 2643c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 2645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Forward jump. Propagate the current min length to the target loc of the jump. 2646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(jmpDest <= end+1); 2647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(jmpDest) > currentLen) { 2648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 2649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV: 2656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV_X: 2657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Combo of state save to the next loc, + jmp backwards. 2658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Net effect on min. length computation is nothing. 2659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2662c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_BACKTRACK: 2663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Fails are kind of like a branch, except that the min length was 2664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // propagated already, by the state save. 2665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc+1); 2666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATE_SAVE: 2671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // State Save, for forward jumps, propagate the current minimum. 2673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the state save. 2674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 2675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest > loc) { 2676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen < forwardedLength.elementAti(jmpDest)) { 2677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 2678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2679c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2683c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING: 2688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 269050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringLen = URX_VAL(stringLenOp); 2692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(stringLenOp >= 2); 2694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Add the starting character of this string to the set of possible starting 2696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // characters for this pattern. 2697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringStartIdx = URX_VAL(op); 2698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->add(c); 2700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Remember this string. After the entire pattern has been checked, 2702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if nothing else is identified that can start a match, we'll use it. 2703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings++; 2704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialStringIdx = stringStartIdx; 2705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialStringLen = stringLen; 2706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2707c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen += stringLen; 2709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_I: 2714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Case-insensitive string. Unlike exact-match strings, we won't 2716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // attempt a string search for possible match positions. But we 2717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // do update the set of possible starting characters. 2718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 271950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringLen = URX_VAL(stringLenOp); 2721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(stringLenOp >= 2); 2723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Add the starting character of this string to the set of possible starting 2725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // characters for this pattern. 2726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringStartIdx = URX_VAL(op); 2727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet s(c, c); 2729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru s.closeOver(USET_CASE_INSENSITIVE); 2730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(s); 2731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; // Matching on an initial string not possible. 2732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen += stringLen; 2734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT: 2739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT_NG: 2740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop Init Ops. These don't change the min length, but they are 4 word ops 2742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // so location must be updated accordingly. 2743c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Loop Init Ops. 2744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the min loop count == 0 2745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // move loc forwards to the end of the loop, skipping over the body. 2746c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // If the min count is > 0, 2747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // continue normal processing of the body of the loop. 274850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 2749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopEndLoc = URX_VAL(loopEndLoc); 275050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 2751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (minLoopCount == 0) { 2752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Min Loop Count of 0, treat like a forward branch and 2753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // move the current minimum length up to the target 2754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (end of loop) location. 2755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(loopEndLoc <= end+1); 2756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(loopEndLoc) > currentLen) { 2757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, loopEndLoc); 2758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2759c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc+=3; // Skips over operands of CTR_INIT 2761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP: 2767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP_NG: 2768c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Loop ops. 2769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The jump is conditional, backwards only. 2770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2772c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_C: 2774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // More loop ops. These state-save to themselves. 2775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // don't change the minimum match 2776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2778c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_START: 2781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_START: 2782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look-around. Scan forward until the matching look-ahead end, 2784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // without processing the look-around block. This is overly pessimistic. 2785c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2786c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Keep track of the nesting depth of look-around blocks. Boilerplate code for 2787c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // lookahead contains two LA_END instructions, so count goes up by two 2788c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // for each LA_START. 2789c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t depth = (opType == URX_LA_START? 2: 1); 2790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 2791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 279250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2793c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_START) { 2794c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru depth+=2; 2795c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2796c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op) == URX_LB_START) { 2797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru depth++; 2798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 2800c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru depth--; 2801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (depth == 0) { 2802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_STATE_SAVE) { 2806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Need this because neg lookahead blocks will FAIL to outside 2807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the block. 2808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 2809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest > loc) { 2810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen < forwardedLength.elementAti(jmpDest)) { 2811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 2812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2815c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(loc <= end); 2816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2819c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_END: 2821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_CONT: 2822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_END: 2823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_CONT: 2824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_END: 2825c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(FALSE); // Shouldn't get here. These ops should be 2826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // consumed by the scan in URX_LA_START and LB_START 2827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2829c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 2831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 2832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2833c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We have finished walking through the ops. Check whether some forward jump 2838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // propagated a shorter length to location end+1. 2839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(end+1) < currentLen) { 2840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(end+1); 2841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars8->init(fRXPat->fInitialChars); 2845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Sort out what we should check for when looking for candidate match start positions. 2848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // In order of preference, 2849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. Start of input text buffer. 2850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. A literal string. 2851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. Start of line in multi-line mode. 2852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. A single literal character. 2853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. A character from a set of characters. 2854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fRXPat->fStartType == START_START) { 2856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Match only at the start of an input text string. 2857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // start type is already set. We're done. 2858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { 2859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Match beginning only with a literal string. 2860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); 2861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(fRXPat->fInitialChars->contains(c)); 2862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_STRING; 2863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChar = c; 2864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (fRXPat->fStartType == START_LINE) { 2865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Match at start of line in Multi-Line mode. 2866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Nothing to do here; everything is already set. 2867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (fRXPat->fMinMatchLen == 0) { 2868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Zero length match possible. We could start anywhere. 2869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_NO_INFO; 2870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (fRXPat->fInitialChars->size() == 1) { 2871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // All matches begin with the same char. 2872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_CHAR; 2873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); 2874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); 2875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE && 2876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fMinMatchLen > 0) { 2877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Matches start with a set of character smaller than the set of all chars. 2878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_SET; 2879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 2880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Matches can start with anything 2881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_NO_INFO; 2882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 2885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// minMatchLength Calculate the length of the shortest string that could 2892c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// match the specified pattern. 2893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Length is in 16 bit code units, not code points. 2894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The calculated length may not be exact. The returned 2896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// value may be shorter than the actual minimum; it must 2897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// never be longer. 2898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// start and end are the range of p-code operations to be 2900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// examined. The endpoints are included in the range. 2901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { 2904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 2905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 2906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(start <= end); 2909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(end < fRXPat->fCompiledPat->size()); 2910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loc; 2913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; 2914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType; 2915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t currentLen = 0; 2916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forwardedLength is a vector holding minimum-match-length values that 2919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // It must be one longer than the pattern being checked because some ops 2921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // will jmp to a end-of-block+1 location from within a block, and we must 2922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // count those when checking the block. 2923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 forwardedLength(end+2, *fStatus); 2924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setSize(end+2); 2925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=start; loc<=end+1; loc++) { 2926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(INT32_MAX, loc); 2927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc = start; loc<=end; loc++) { 293050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType = URX_TYPE(op); 2932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The loop is advancing linearly through the pattern. 2934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the op we are now at was the destination of a branch in the pattern, 2935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and that path has a shorter minimum length than the current accumulated value, 2936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // replace the current accumulated value. 2937c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some 2938c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // no-match-possible cases. 2939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(loc) < currentLen) { 2940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc); 2941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (opType) { 2945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that don't change the total length matched 2946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RESERVED_OP: 2947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END: 2948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_LEN: 2949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_NOP: 2950b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_START_CAPTURE: 2951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END_CAPTURE: 2952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_B: 2953b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_BU: 2954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_G: 2955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_Z: 2956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET: 2957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOLLAR: 2958c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_M: 2959c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_D: 2960c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_MD: 2961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RELOC_OPRND: 2962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_INP_LOC: 2963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET_M: 2964c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_CARET_M_UNIX: 2965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF_I: 2967b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2968b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LD_SP: 2970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV: 2972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV_X: 2973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2974c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that match a minimum of one character (one or two 16 bit code units.) 2977c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 2978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR: 2979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATIC_SETREF: 2980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STAT_SETREF_N: 2981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_SETREF: 2982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_D: 2983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR_I: 2984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 2985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY_ALL: // . matches one or two. 2986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY: 2987c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOTANY_UNIX: 2988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMPX: 2993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; // URX_JMPX has an extra operand, ignored here, 2994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // otherwise processed identically to URX_JMP. 2995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP: 2996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 2998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest < loc) { 2999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop of some kind. Can safely ignore, the worst that will happen 3000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is that we understate the true minimum length 3001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc+1); 3002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 3003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Forward jump. Propagate the current min length to the target loc of the jump. 3004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(jmpDest <= end+1); 3005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(jmpDest) > currentLen) { 3006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3012c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_BACKTRACK: 3013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3014c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Back-tracks are kind of like a branch, except that the min length was 3015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // propagated already, by the state save. 3016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc+1); 3017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3020b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATE_SAVE: 3022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // State Save, for forward jumps, propagate the current minimum. 3024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the state save. 3025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 3026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest > loc) { 3027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen < forwardedLength.elementAti(jmpDest)) { 3028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3030c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3033c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING: 3036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_I: 3037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 303950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen += URX_VAL(stringLenOp); 3041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3045b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT: 3046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT_NG: 3047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3048c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Loop Init Ops. 3049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the min loop count == 0 3050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // move loc forwards to the end of the loop, skipping over the body. 3051c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // If the min count is > 0, 3052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // continue normal processing of the body of the loop. 305350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 3054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopEndLoc = URX_VAL(loopEndLoc); 305550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 3056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (minLoopCount == 0) { 3057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc = loopEndLoc; 3058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 3059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc+=3; // Skips over operands of CTR_INIT 3060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3062b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3063b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP: 3066b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP_NG: 3067c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Loop ops. 3068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The jump is conditional, backwards only. 3069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3070c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_SR_I: 3072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_DOT_I: 3073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_C: 3074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // More loop ops. These state-save to themselves. 3075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // don't change the minimum match - could match nothing at all. 3076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3077c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_START: 3080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_START: 3081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look-around. Scan forward until the matching look-ahead end, 3083c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // without processing the look-around block. This is overly pessimistic for look-ahead, 3084c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // it assumes that the look-ahead match might be zero-length. 3085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: Positive lookahead could recursively do the block, then continue 3086c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // with the longer of the block or the value coming in. Ticket 6060 3087c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t depth = (opType == URX_LA_START? 2: 1);; 3088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 309050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3091c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_START) { 3092c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The boilerplate for look-ahead includes two LA_END insturctions, 3093c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Depth will be decremented by each one when it is seen. 3094c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru depth += 2; 3095c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3096c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op) == URX_LB_START) { 3097b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru depth++; 3098b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3099c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_END) { 3100c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru depth--; 3101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (depth == 0) { 3102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3104c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3105c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op)==URX_LBN_END) { 3106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru depth--; 3107c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (depth == 0) { 3108c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 3109c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_STATE_SAVE) { 3112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Need this because neg lookahead blocks will FAIL to outside 3113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the block. 3114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 3115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest > loc) { 3116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen < forwardedLength.elementAti(jmpDest)) { 3117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3121c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(loc <= end); 3122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3125c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_END: 3127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_CONT: 3128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_END: 3129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_CONT: 3130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_END: 3131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Only come here if the matching URX_LA_START or URX_LB_START was not in the 3132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // range being sized, which happens when measuring size of look-behind blocks. 3133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3134c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 3136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 3137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3138c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We have finished walking through the ops. Check whether some forward jump 3142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // propagated a shorter length to location end+1. 3143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(end+1) < currentLen) { 3144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(end+1); 3145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 3146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3147c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return currentLen; 3149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// maxMatchLength Calculate the length of the longest string that could 3156c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// match the specified pattern. 3157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Length is in 16 bit code units, not code points. 3158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The calculated length may not be exact. The returned 3160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// value may be longer than the actual maximum; it must 3161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// never be shorter. 3162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { 3165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 3166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 3167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(start <= end); 3169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(end < fRXPat->fCompiledPat->size()); 3170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loc; 3173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; 3174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType; 3175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t currentLen = 0; 3176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 forwardedLength(end+1, *fStatus); 3177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setSize(end+1); 3178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=start; loc<=end; loc++) { 3180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(0, loc); 3181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc = start; loc<=end; loc++) { 318450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType = URX_TYPE(op); 3186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The loop is advancing linearly through the pattern. 3188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the op we are now at was the destination of a branch in the pattern, 3189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and that path has a longer maximum length than the current accumulated value, 3190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // replace the current accumulated value. 3191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(loc) > currentLen) { 3192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc); 3193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (opType) { 3196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that don't change the total length matched 3197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RESERVED_OP: 3198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END: 3199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_LEN: 3200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_NOP: 3201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_START_CAPTURE: 3202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END_CAPTURE: 3203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_B: 3204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_BU: 3205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_G: 3206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_Z: 3207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET: 3208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOLLAR: 3209c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_M: 3210c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_D: 3211c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_MD: 3212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RELOC_OPRND: 3213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_INP_LOC: 3214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET_M: 3215c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_CARET_M_UNIX: 3216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 3218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LD_SP: 3219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_END: 3221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_CONT: 3222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_CONT: 3223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_END: 3224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3225c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that increase that cause an unbounded increase in the length 3228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of a matched string, or that increase it a hard to characterize way. 3229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Call the max length unbounded, and stop further checking. 3230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 3231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF_I: 3232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 3233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = INT32_MAX; 3234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that match a max of one character (possibly two 16 bit code units.) 3238c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 3239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATIC_SETREF: 3240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STAT_SETREF_N: 3241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_SETREF: 3242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_D: 3243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR_I: 3244c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOTANY_ALL: 3245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY: 3246c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOTANY_UNIX: 3247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen+=2; 3248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Single literal character. Increase current max length by one or two, 3251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // depending on whether the char is in the supplementary range. 3252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR: 3253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 3254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_VAL(op) > 0x10000) { 3255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 3256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3259c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Jumps. 3260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP: 3262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMPX: 3263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV: 3264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV_X: 3265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 3267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest < loc) { 3268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop of some kind. Max match length is unbounded. 3269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = INT32_MAX; 3270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 3271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Forward jump. Propagate the current min length to the target loc of the jump. 3272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(jmpDest) < currentLen) { 3273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = 0; 3276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3280c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_BACKTRACK: 3281c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // back-tracks are kind of like a branch, except that the max length was 3282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // propagated already, by the state save. 3283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc+1); 3284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATE_SAVE: 3288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // State Save, for forward jumps, propagate the current minimum. 3290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the state save. 3291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // For backwards jumps, they create a loop, maximum 3292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // match length is unbounded. 3293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 3294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest > loc) { 3295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen > forwardedLength.elementAti(jmpDest)) { 3296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 3299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = INT32_MAX; 3300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3303c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING: 3308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_I: 3309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 331150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen += URX_VAL(stringLenOp); 3313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT: 3318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT_NG: 3319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP: 3320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP_NG: 3321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_SR_I: 3322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_DOT_I: 3323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_C: 3324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // For anything to do with loops, make the match length unbounded. 3325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note: INIT instructions are multi-word. Can ignore because 3326b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // INT32_MAX length will stop the per-instruction loop. 3327b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = INT32_MAX; 3328b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3329c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3330c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_START: 3333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_END: 3334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look-ahead. Just ignore, treat the look-ahead block as if 3335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // it were normal pattern. Gives a too-long match length, 3336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // but good enough for now. 3337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3338c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // End of look-ahead ops should always be consumed by the processing at 3340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the URX_LA_START op. 3341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // U_ASSERT(FALSE); 3342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // break; 3343c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_START: 3345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look-behind. Scan forward until the matching look-around end, 3347c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // without processing the look-behind block. 3348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t depth = 0; 3349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 335150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) { 3353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru depth++; 3354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 3356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (depth == 0) { 3357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru depth--; 3360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3361c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(loc < end); 3362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 3367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 3368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3370c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == INT32_MAX) { 3372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The maximum length is unbounded. 3373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Stop further processing of the pattern. 3374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3376c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return currentLen; 3379c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// stripNOPs Remove any NOP operations from the compiled pattern code. 3386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Extra NOPs are inserted for some constructs during the initial 3387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// code generation to provide locations that may be patched later. 3388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Many end up unneeded, and are removed by this function. 3389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 339050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// In order to minimize the number of passes through the pattern, 339150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// back-reference fixup is also performed here (adjusting 339250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// back-reference operands to point to the correct frame offsets). 339350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// 339450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// In addition, case-insensitive character and string literals are 339550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// now case-folded here, rather than when first parsed or at match 339650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// time. 339750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// 3398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::stripNOPs() { 3400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 3402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end = fRXPat->fCompiledPat->size(); 3406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 deltas(end, *fStatus); 3407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Make a first pass over the code, computing the amount that things 3409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // will be offset at each location in the original code. 3410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loc; 3411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t d = 0; 3412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=0; loc<end; loc++) { 3413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru deltas.addElement(d, *fStatus); 341450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_NOP) { 3416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru d++; 3417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 341950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 342050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString caseStringBuffer; 342150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stringDelta = 0; 3422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Make a second pass over the code, removing the NOPs by moving following 3424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // code up, and patching operands that refer to code locations that 3425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are being moved. The array of offsets from the first step is used 3426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to compute the new operand values. 3427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t src; 3428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dst = 0; 3429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (src=0; src<end; src++) { 343050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); 3431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType = URX_TYPE(op); 3432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (opType) { 3433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_NOP: 3434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATE_SAVE: 3437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP: 3438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP: 3439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP_NG: 3440b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RELOC_OPRND: 3441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMPX: 3442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV: 3443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV_X: 3444b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // These are instructions with operands that refer to code locations. 3445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t operandAddress = URX_VAL(op); 3447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); 3448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress); 3449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru op = URX_BUILD(opType, fixedOperandAddress); 3450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, dst); 3451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru dst++; 3452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 345550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case URX_ONECHAR_I: 345650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { 345750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c = URX_VAL(op); 345850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 345950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // We have a cased character to fold 346050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c = u_foldCase(c, U_FOLD_CASE_DEFAULT); 346150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = URX_BUILD(URX_ONECHAR_I, c); 346250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 346350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 346450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fCompiledPat->setElementAt(op, dst); 346550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dst++; 346650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 346750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 346850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case URX_STRING_I: 346950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { 347050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = URX_BUILD(URX_STRING_I, URX_VAL(op)+stringDelta); 347150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 347250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho src++; 347350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t lengthOp = (int32_t)fRXPat->fCompiledPat->elementAti(src); 347450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 347550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho caseStringBuffer.setTo(fRXPat->fLiteralText, URX_VAL(op), URX_VAL(lengthOp)); 347650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho caseStringBuffer.foldCase(U_FOLD_CASE_DEFAULT); 347750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 347850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t newLen = caseStringBuffer.length(); 347950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (newLen <= URX_VAL(lengthOp)) { 348050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // don't shift if we don't have to, take the tiny memory hit of a smaller string 348150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fLiteralText.replace(URX_VAL(op), newLen, caseStringBuffer); 348250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 348350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // shift other strings over...at least UnicodeString handles this for us! 348450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fLiteralText.replace(URX_VAL(op), URX_VAL(lengthOp), caseStringBuffer); 348550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho stringDelta += newLen - URX_VAL(lengthOp); 348650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 348750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho lengthOp = URX_BUILD(URX_STRING_LEN, newLen); 348850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 348950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fCompiledPat->setElementAt(op, dst); 349050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fCompiledPat->setElementAt(lengthOp, dst+1); 349150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dst += 2; 349250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 349350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 349450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case URX_BACKREF: 349550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case URX_BACKREF_I: 349650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { 349750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t where = URX_VAL(op); 349850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (where > fRXPat->fGroupMap->size()) { 349950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho error(U_REGEX_INVALID_BACK_REF); 350050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 350150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 350250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho where = fRXPat->fGroupMap->elementAti(where-1); 350350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = URX_BUILD(opType, where); 350450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fCompiledPat->setElementAt(op, dst); 350550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dst++; 350650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 350750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fNeedsAltInput = TRUE; 350850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 350950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 351050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case URX_STRING: 351150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = URX_BUILD(URX_STRING, URX_VAL(op)+stringDelta); 351250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // continue 3513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RESERVED_OP: 3514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RESERVED_OP_N: 3515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKTRACK: 3516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END: 3517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR: 3518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_LEN: 3519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_START_CAPTURE: 3520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END_CAPTURE: 3521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATIC_SETREF: 3522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STAT_SETREF_N: 3523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_SETREF: 3524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY: 3525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_FAIL: 3526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_B: 3527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_BU: 3528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_G: 3529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_X: 3530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_Z: 3531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY_ALL: 3532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_D: 3533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET: 3534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOLLAR: 3535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT: 3536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT_NG: 3537c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOTANY_UNIX: 3538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_SP: 3539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LD_SP: 3540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_INP_LOC: 3541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_START: 3542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_END: 3543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOLLAR_M: 3544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET_M: 3545c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_CARET_M_UNIX: 3546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_START: 3547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_CONT: 3548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_END: 3549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_CONT: 3550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_END: 3551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_SR_I: 3552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_DOT_I: 3553b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_C: 3554c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_D: 3555c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_MD: 3556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // These instructions are unaltered by the relocation. 3557b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, dst); 3558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru dst++; 3559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 3562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Some op is unaccounted for. 3563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 3564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_INTERNAL_ERROR); 3565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setSize(dst); 3569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Error Report a rule parse error. 3577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Only report it if no previous error has been recorded. 3578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3579b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::error(UErrorCode e) { 3581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*fStatus)) { 3582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *fStatus = e; 358350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public 358450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are 358550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // int64_t. If the values of the latter are out of range for the former, 358650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // set them to the appropriate "field not supported" values. 358750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (fLineNum > 0x7FFFFFFF) { 358850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->line = 0; 358950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->offset = -1; 359050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if (fCharNum > 0x7FFFFFFF) { 359150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->line = (int32_t)fLineNum; 359250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->offset = -1; 359350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 359450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->line = (int32_t)fLineNum; 359550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->offset = (int32_t)fCharNum; 359650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 359750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 359850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context 3599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Fill in the context. 3601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note: extractBetween() pins supplied indicies to the string bounds. 3602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); 3603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); 360450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); 360550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); 3606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3607b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3608b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3609b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3611b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Assorted Unicode character constants. 3612b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Numeric because there is no portable way to enter them as literals. 3613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// (Think EBCDIC). 3614b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3615b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic const UChar chCR = 0x0d; // New lines, for terminating comments. 3616c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chLF = 0x0a; // Line Feed 3617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic const UChar chPound = 0x23; // '#', introduces a comment. 3618c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chDigit0 = 0x30; // '0' 3619c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chDigit7 = 0x37; // '9' 3620c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chColon = 0x3A; // ':' 3621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic const UChar chE = 0x45; // 'E' 3622c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chQ = 0x51; // 'Q' 3623c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chN = 0x4E; // 'N' 3624c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chP = 0x50; // 'P' 3625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic const UChar chBackSlash = 0x5c; // '\' introduces a char escape 3626c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chLBracket = 0x5b; // '[' 3627c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chRBracket = 0x5d; // ']' 3628c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chUp = 0x5e; // '^' 3629c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chLowerP = 0x70; 3630c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chLBrace = 0x7b; // '{' 3631c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chRBrace = 0x7d; // '}' 3632c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chNEL = 0x85; // NEL newline variant 3633c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chLS = 0x2028; // Unicode Line Separator 3634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// nextCharLL Low Level Next Char from the regex pattern. 3639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Get a char from the string, keep track of input position 3640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// for error reporting. 3641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUChar32 RegexCompile::nextCharLL() { 3644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 ch; 3645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fPeekChar != -1) { 3647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ch = fPeekChar; 3648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPeekChar = -1; 3649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return ch; 3650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 365150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 365250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // assume we're already in the right place 365350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ch = UTEXT_NEXT32(fRXPat->fPattern); 365450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (ch == U_SENTINEL) { 365550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return ch; 3656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ch == chCR || 3659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ch == chNEL || 3660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ch == chLS || 366127f654740f2a26ad62a5c155af9199af9e69b889claireho (ch == chLF && fLastChar != chCR)) { 3662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Character is starting a new line. Bump up the line number, and 3663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // reset the column to 0. 3664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fLineNum++; 3665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fCharNum=0; 3666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Character is not starting a new line. Except in the case of a 3669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LF following a CR, increment the column position. 3670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ch != chLF) { 3671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fCharNum++; 3672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fLastChar = ch; 3675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return ch; 3676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// peekCharLL Low Level Character Scanning, sneak a peek at the next 3681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// character without actually getting it. 3682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUChar32 RegexCompile::peekCharLL() { 3685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fPeekChar == -1) { 3686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPeekChar = nextCharLL(); 3687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fPeekChar; 3689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// nextChar for pattern scanning. At this level, we handle stripping 3695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// out comments and processing some backslash character escapes. 3696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The rest of the pattern grammar is handled at the next level up. 3697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::nextChar(RegexPatternChar &c) { 3700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 370150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c.fChar = nextCharLL(); 3703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c.fQuoted = FALSE; 3704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fQuoteMode) { 3706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c.fQuoted = TRUE; 3707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((c.fChar==chBackSlash && peekCharLL()==chE) || c.fChar == (UChar32)-1) { 3708b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fQuoteMode = FALSE; // Exit quote mode, 3709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextCharLL(); // discard the E 3710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextChar(c); // recurse to get the real next char 3711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else if (fInBackslashQuote) { 3714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The current character immediately follows a '\' 3715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Don't check for any further escapes, just return it as-is. 3716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Don't set c.fQuoted, because that would prevent the state machine from 3717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // dispatching on the character. 3718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fInBackslashQuote = FALSE; 3719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else 3721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We are not in a \Q quoted region \E of the source. 3723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_COMMENTS) { 3725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We are in free-spacing and comments mode. 3727c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scan through any white space and comments, until we 3728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // reach a significant character or the end of inut. 3729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c.fChar == (UChar32)-1) { 3731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; // End of Input 3732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c.fChar == chPound && fEOLComments == TRUE) { 3734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Start of a comment. Consume the rest of it, until EOF or a new line 3735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c.fChar = nextCharLL(); 3737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c.fChar == (UChar32)-1 || // EOF 3738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c.fChar == chCR || 3739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c.fChar == chLF || 3740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c.fChar == chNEL || 3741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c.fChar == chLS) { 3742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3746c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061. 3747b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (PatternProps::isWhiteSpace(c.fChar) == FALSE) { 3748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c.fChar = nextCharLL(); 3751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // check for backslash escaped characters. 3756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (c.fChar == chBackSlash) { 375850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { 3760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // A '\' sequence that is handled by ICU's standard unescapeAt function. 3762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Includes \uxxxx, \n, \r, many others. 3763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Return the single equivalent character. 3764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextCharLL(); // get & discard the peeked char. 3766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c.fQuoted = TRUE; 376750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 376850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) { 376950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t endIndex = (int32_t)pos; 377050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); 377150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 377250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (endIndex == pos) { 377350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho error(U_REGEX_BAD_ESCAPE_SEQUENCE); 377450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 377550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fCharNum += endIndex - pos; 377650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); 377750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 377850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t offset = 0; 377950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern); 378050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 378150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); 378250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 378350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 378450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (offset == 0) { 378550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho error(U_REGEX_BAD_ESCAPE_SEQUENCE); 378650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if (context.lastOffset == offset) { 378750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UTEXT_PREVIOUS32(fRXPat->fPattern); 378850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if (context.lastOffset != offset-1) { 378950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1); 379050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 379150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fCharNum += offset; 3792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3794c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (peekCharLL() == chDigit0) { 3795c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Octal Escape, using Java Regexp Conventions 3796c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // which are \0 followed by 1-3 octal digits. 3797c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Different from ICU Unescape handling of Octal, which does not 3798c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // require the leading 0. 379950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Java also has the convention of only consuming 2 octal digits if 3800c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the three digit number would be > 0xff 3801c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 3802c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c.fChar = 0; 3803c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextCharLL(); // Consume the initial 0. 3804c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int index; 3805c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (index=0; index<3; index++) { 3806c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t ch = peekCharLL(); 3807c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (ch<chDigit0 || ch>chDigit7) { 3808c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (index==0) { 3809c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // \0 is not followed by any octal digits. 3810c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3811c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3812c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 3813c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3814c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c.fChar <<= 3; 3815c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c.fChar += ch&7; 3816c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (c.fChar <= 255) { 3817c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextCharLL(); 3818c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 3819c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The last digit made the number too big. Forget we saw it. 3820c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c.fChar >>= 3; 3821c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3822c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3823c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru c.fQuoted = TRUE; 3824c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3825c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (peekCharLL() == chQ) { 3826c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // "\Q" enter quote mode, which will continue until "\E" 3827c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fQuoteMode = TRUE; 3828c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextCharLL(); // discard the 'Q'. 3829c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(c); // recurse to get the real next char. 3830c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else 3832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We are in a '\' escape that will be handled by the state table scanner. 3834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Just return the backslash, but remember that the following char is to 3835c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // be taken literally. 3836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fInBackslashQuote = TRUE; 3837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // re-enable # to end-of-line comments, in case they were disabled. 3842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // They are disabled by the parser upon seeing '(?', but this lasts for 3843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the fetching of the next character only. 3844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fEOLComments = TRUE; 3845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // putc(c.fChar, stdout); 3847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3853c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// scanNamedChar 3854c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. 3855c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3856c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// The scan position will be at the 'N'. On return 3857c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// the scan position should be just after the '}' 3858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3859c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Return the UChar32 3860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3862c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruUChar32 RegexCompile::scanNamedChar() { 3863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 3864c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return 0; 3865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3867c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 3868c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar != chLBrace) { 3869c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 3870c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return 0; 3871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3872c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3873c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString charName; 3874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3875c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 3876c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar == chRBrace) { 3877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3879c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar == -1) { 3880c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 3881c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return 0; 3882c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3883c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru charName.append(fC.fChar); 3884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3885c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3886c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru char name[100]; 3887c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || 3888c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (uint32_t)charName.length()>=sizeof(name)) { 3889c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // All Unicode character names have only invariant characters. 3890c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The API to get a character, given a name, accepts only char *, forcing us to convert, 3891c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // which requires this error check 3892c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 3893c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return 0; 3894c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3895c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru charName.extract(0, charName.length(), name, sizeof(name), US_INV); 3896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3897c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); 3898c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 3899c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 3900c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3902c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); // Continue overall regex pattern processing with char after the '}' 3903c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return theChar; 3904c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// scanProp Construct a UnicodeSet from the text at the current scan 3909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// position, which will be of the form \p{whaterver} 3910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The scan position will be at the 'p' or 'P'. On return 3912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// the scan position should be just after the '}' 3913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Return a UnicodeSet, constructed from the \P pattern, 3915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// or NULL if the pattern is invalid. 3916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUnicodeSet *RegexCompile::scanProp() { 3919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *uset = NULL; 3920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 3922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 3923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3924c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); 3925c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool negated = (fC.fChar == chP); 3926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3927c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString propertyName; 3928c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 3929c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar != chLBrace) { 3930c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 3931c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 3932c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3934c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 3935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fC.fChar == chRBrace) { 3936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fC.fChar == -1) { 3939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Hit the end of the input string without finding the closing '}' 3940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 3941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 3942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3943c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru propertyName.append(fC.fChar); 3944c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3945c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uset = createSetForProperty(propertyName, negated); 3946c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); // Move input scan to position following the closing '}' 3947c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return uset; 3948c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 3949c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3950c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//------------------------------------------------------------------------------ 3951c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3952c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// scanPosixProp Construct a UnicodeSet from the text at the current scan 3953c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// position, which is expected be of the form [:property expression:] 3954c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3955c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// The scan position will be at the opening ':'. On return 3956c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// the scan position must be on the closing ']' 3957c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3958c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Return a UnicodeSet constructed from the pattern, 3959c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// or NULL if this is not a valid POSIX-style set expression. 3960c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// If not a property expression, restore the initial scan position 3961c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// (to the opening ':') 3962c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3963c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Note: the opening '[:' is not sufficient to guarantee that 3964c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// this is a [:property:] expression. 3965c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// [:'+=,] is a perfectly good ordinary set expression that 3966c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// happens to include ':' as one of its characters. 3967c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 3968c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//------------------------------------------------------------------------------ 3969c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruUnicodeSet *RegexCompile::scanPosixProp() { 3970c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *uset = NULL; 3971c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3972c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 3973c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 3974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3976c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fC.fChar == chColon); 3977c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3978c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Save the scanner state. 3979c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062 398050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t savedScanIndex = fScanIndex; 398150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3982c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool savedQuoteMode = fQuoteMode; 3983c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool savedInBackslashQuote = fInBackslashQuote; 3984c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool savedEOLComments = fEOLComments; 398550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t savedLineNum = fLineNum; 398650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t savedCharNum = fCharNum; 3987c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 savedLastChar = fLastChar; 3988c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 savedPeekChar = fPeekChar; 3989c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru RegexPatternChar savedfC = fC; 3990c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3991c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scan for a closing ]. A little tricky because there are some perverse 3992c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, 3993c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // ending on the second closing ]. 3994c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3995c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString propName; 3996c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool negated = FALSE; 3997c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3998c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] 3999c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4000c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar == chUp) { 4001c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru negated = TRUE; 4002c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4003c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4004c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4005c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scan for the closing ":]", collecting the property name along the way. 4006c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool sawPropSetTerminator = FALSE; 4007c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (;;) { 4008c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru propName.append(fC.fChar); 4009c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4010c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fQuoted || fC.fChar == -1) { 4011c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Escaped characters or end of input - either says this isn't a [:Property:] 4012c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4013c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4014c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar == chColon) { 4015c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4016c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar == chRBracket) { 4017c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru sawPropSetTerminator = TRUE; 4018c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4019c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4020c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4021c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4022c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4023c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (sawPropSetTerminator) { 4024c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uset = createSetForProperty(propName, negated); 4025c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4026c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else 4027c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 4028c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No closing ":]". 4029c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Restore the original scan position. 4030c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The main scanner will retry the input as a normal set expression, 4031c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // not a [:Property:] expression. 4032c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fScanIndex = savedScanIndex; 4033c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fQuoteMode = savedQuoteMode; 4034c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fInBackslashQuote = savedInBackslashQuote; 4035c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fEOLComments = savedEOLComments; 4036c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLineNum = savedLineNum; 4037c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fCharNum = savedCharNum; 4038c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLastChar = savedLastChar; 4039c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fPeekChar = savedPeekChar; 4040c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fC = savedfC; 404150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); 4042c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4043c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return uset; 4044c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 4045c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4046c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { 4047c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); 4048c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_CF_MASK, ec); 4049c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 4050c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4051c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4052c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Create a Unicode Set from a Unicode Property expression. 4053c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// This is common code underlying both \p{...} ane [:...:] expressions. 4054c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Includes trying the Java "properties" that aren't supported as 4055c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// normal ICU UnicodeSet properties 4056c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4057c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" 4058c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" 4059c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruUnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { 4060c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString setExpr; 4061c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set; 4062c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint32_t usetFlags = 0; 4063c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4064c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 4065c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 4066c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4067c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4068c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4069c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // First try the property as we received it 4070c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4071c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (negated) { 4072c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(negSetPrefix, -1); 4073c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4074c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(posSetPrefix, -1); 4075c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4076c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(propName); 4077c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(chRBrace); 4078c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(chRBracket); 4079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 4080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usetFlags |= USET_CASE_INSENSITIVE; 4081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4082c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4083c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_SUCCESS(*fStatus)) { 4084c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return set; 4085c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4086c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete set; 4087c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = NULL; 4088c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4089c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4090c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The property as it was didn't work. 409127f654740f2a26ad62a5c155af9199af9e69b889claireho 409227f654740f2a26ad62a5c155af9199af9e69b889claireho // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX 409327f654740f2a26ad62a5c155af9199af9e69b889claireho // or standard Java, but many other regular expression packages do recognize it. 409427f654740f2a26ad62a5c155af9199af9e69b889claireho 409527f654740f2a26ad62a5c155af9199af9e69b889claireho if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { 409627f654740f2a26ad62a5c155af9199af9e69b889claireho *fStatus = U_ZERO_ERROR; 409727f654740f2a26ad62a5c155af9199af9e69b889claireho set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); 409827f654740f2a26ad62a5c155af9199af9e69b889claireho if (set == NULL) { 409927f654740f2a26ad62a5c155af9199af9e69b889claireho *fStatus = U_MEMORY_ALLOCATION_ERROR; 410027f654740f2a26ad62a5c155af9199af9e69b889claireho return set; 410127f654740f2a26ad62a5c155af9199af9e69b889claireho } 410227f654740f2a26ad62a5c155af9199af9e69b889claireho if (negated) { 410327f654740f2a26ad62a5c155af9199af9e69b889claireho set->complement(); 410427f654740f2a26ad62a5c155af9199af9e69b889claireho } 410527f654740f2a26ad62a5c155af9199af9e69b889claireho return set; 410627f654740f2a26ad62a5c155af9199af9e69b889claireho } 410727f654740f2a26ad62a5c155af9199af9e69b889claireho 410827f654740f2a26ad62a5c155af9199af9e69b889claireho 410927f654740f2a26ad62a5c155af9199af9e69b889claireho // Do Java fixes - 4110c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // InGreek -> InGreek or Coptic, that being the official Unicode name for that block. 4111c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols. 4112c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4113c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols" 4114c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // is accepted by Java. The property part of the name is compared 4115c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // case-insenstively. The spaces must be exactly as shown, either 4116c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // all there, or all omitted, with exactly one at each position 4117c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // if they are present. From checking against JDK 1.6 4118c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4119c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This code should be removed when ICU properties support the Java compatibility names 4120c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // (ICU 4.0?) 4121c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4122c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString mPropName = propName; 4123c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { 4124c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); 4125c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4126c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 || 4127c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) { 4128c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols"); 4129c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4130c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4131c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); 4132c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4133c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4134c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // See if the property looks like a Java "InBlockName", which 4135c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // we will recast as "Block=BlockName" 4136c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4137c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru static const UChar IN[] = {0x49, 0x6E, 0}; // "In" 4138c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block=" 4139c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mPropName.startsWith(IN, 2) && propName.length()>=3) { 4140c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.truncate(4); // Leaves "[\p{", or "[\P{" 4141c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(BLOCK, -1); 4142c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed. 4143c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(chRBrace); 4144c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(chRBracket); 4145c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *fStatus = U_ZERO_ERROR; 4146c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4147c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_SUCCESS(*fStatus)) { 4148c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return set; 4149c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4150c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete set; 4151c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = NULL; 4152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4154c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) || 4155c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru propName.compare(UNICODE_STRING_SIMPLE("all")) == 0) 4156c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 4157c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode localStatus = U_ZERO_ERROR; 4158c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru //setExpr.remove(); 4159c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = new UnicodeSet(); 4160c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4161c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Try the various Java specific properties. 4162c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // These all begin with "java" 4163c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4164c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) { 4165c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_CN_MASK, localStatus); 4166c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->complement(); 4167c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4168c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) { 4169c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_ND_MASK, localStatus); 4170c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4171c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) { 4172c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addIdentifierIgnorable(set, localStatus); 4173c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4174c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) { 4175c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(0, 0x1F).add(0x7F, 0x9F); 4176c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4177c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) { 4178c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4179c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_SC_MASK, localStatus); 4180c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_PC_MASK, localStatus); 4181c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_ND_MASK, localStatus); 4182c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_NL_MASK, localStatus); 4183c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_MC_MASK, localStatus); 4184c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_MN_MASK, localStatus); 4185c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addIdentifierIgnorable(set, localStatus); 4186c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4187c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) { 4188c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4189c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_NL_MASK, localStatus); 4190c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_SC_MASK, localStatus); 4191c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_PC_MASK, localStatus); 4192c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4193c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) { 4194c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4195c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4196c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) { 4197c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4198c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_ND_MASK, localStatus); 4199c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4200c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) { 4201c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_LL_MASK, localStatus); 4202c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4203c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) { 4204c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus); 4205c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4206c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) { 4207c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_Z_MASK, localStatus); 4208c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4209c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) { 4210c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(0x10000, UnicodeSet::MAX_VALUE); 4211c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4212c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) { 4213c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_LT_MASK, localStatus); 4214c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4215c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) { 4216c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4217c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_NL_MASK, localStatus); 4218c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4219c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) { 4220c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4221c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_PC_MASK, localStatus); 4222c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_ND_MASK, localStatus); 4223c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_NL_MASK, localStatus); 4224c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_MC_MASK, localStatus); 4225c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_MN_MASK, localStatus); 4226c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addIdentifierIgnorable(set, localStatus); 4227c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4228c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) { 4229c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_LU_MASK, localStatus); 4230c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4231c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) { 4232c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(0, UnicodeSet::MAX_VALUE); 4233c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4234c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) { 4235c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_Z_MASK, localStatus); 4236c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); 4237c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(9, 0x0d).add(0x1c, 0x1f); 4238c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4239c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4240c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(0, UnicodeSet::MAX_VALUE); 4241c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4242c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4243c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_SUCCESS(localStatus) && !set->isEmpty()) { 4244c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *fStatus = U_ZERO_ERROR; 4245c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (usetFlags & USET_CASE_INSENSITIVE) { 4246c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->closeOver(USET_CASE_INSENSITIVE); 4247c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4248c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (negated) { 4249c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->complement(); 4250c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4251c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return set; 4252c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4253c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete set; 4254c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = NULL; 4255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4256c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(*fStatus); 4257c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 4258c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 4259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4260c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4261c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4262c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4263c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// SetEval Part of the evaluation of [set expressions]. 4264c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Perform any pending (stacked) operations with precedence 4265c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// equal or greater to that of the next operator encountered 4266c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// in the expression. 4267c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4268c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruvoid RegexCompile::setEval(int32_t nextOp) { 4269c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *rightOperand = NULL; 4270c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *leftOperand = NULL; 4271c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (;;) { 4272c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fSetOpStack.empty()==FALSE); 4273c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t pendingSetOperation = fSetOpStack.peeki(); 4274c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { 4275c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4276c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4277c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.popi(); 4278c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fSetStack.empty() == FALSE); 4279c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru rightOperand = (UnicodeSet *)fSetStack.peek(); 4280c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru switch (pendingSetOperation) { 4281c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setNegation: 4282c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru rightOperand->complement(); 4283c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4284c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setCaseClose: 4285c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: need a simple close function. Ticket 6065 4286c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru rightOperand->closeOver(USET_CASE_INSENSITIVE); 4287c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru rightOperand->removeAllStrings(); 4288c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4289c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setDifference1: 4290c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setDifference2: 4291c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetStack.pop(); 4292c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand = (UnicodeSet *)fSetStack.peek(); 4293c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand->removeAll(*rightOperand); 4294c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete rightOperand; 4295c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4296c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setIntersection1: 4297c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setIntersection2: 4298c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetStack.pop(); 4299c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand = (UnicodeSet *)fSetStack.peek(); 4300c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand->retainAll(*rightOperand); 4301c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete rightOperand; 4302c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4303c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setUnion: 4304c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetStack.pop(); 4305c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand = (UnicodeSet *)fSetStack.peek(); 4306c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand->addAll(*rightOperand); 4307c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete rightOperand; 4308c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4309c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru default: 4310c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(FALSE); 4311c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4312c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4313c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4314c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4315c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4316c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruvoid RegexCompile::setPushOp(int32_t op) { 4317c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setEval(op); 4318c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(op, *fStatus); 4319c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetStack.push(new UnicodeSet(), *fStatus); 4320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_END 4323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 4324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4325