164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// Copyright (C) 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// file: regexcmp.cpp 5b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 68de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// Copyright (C) 2002-2016 International Business Machines Corporation and others. 7b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// All Rights Reserved. 8b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 9b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// This file contains the ICU regular expression compiler, which is responsible 10b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// for processing a regular expression pattern into the compiled form that 11b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// is used by the match finding engine. 
12b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 13b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 14b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/utypes.h" 15b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 16b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#if !UCONFIG_NO_REGULAR_EXPRESSIONS 17b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/ustring.h" 19b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/unistr.h" 20b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uniset.h" 21b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uchar.h" 22b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/uchriter.h" 23b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/parsepos.h" 24b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/parseerr.h" 25b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "unicode/regex.h" 26103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf.h" 27103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf16.h" 28b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#include "patternprops.h" 2950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "putilimp.h" 30b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "cmemory.h" 31b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "cstring.h" 32b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "uvectr32.h" 3350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uvectr64.h" 34b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "uassert.h" 35c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru#include "uinvchar.h" 36b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste 
Queru 37b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "regeximp.h" 38b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "regexcst.h" // Contains state table for the regex pattern parser. 39b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // generated by a Perl script. 40b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "regexcmp.h" 41b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#include "regexst.h" 4250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "regextxt.h" 43b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 44b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 45b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 46b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_BEGIN 47b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 48b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 49b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 50b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 51b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Constructor. 
52b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 53b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 54c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruRegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : 55c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fParenStack(status), fSetStack(status), fSetOpStack(status) 56b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 5750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Lazy init of all shared global sets (needed for init()'s empty text) 5850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho RegexStaticSets::initGlobals(&status); 5950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 60b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStatus = &status; 61b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 62b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat = rxp; 63b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fScanIndex = 0; 64807b6b36605a2970f69dc767fee84a1b2a31e5e3Elliott Hughes fLastChar = -1; 65b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPeekChar = -1; 66b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fLineNum = 1; 67b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fCharNum = 0; 68b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fQuoteMode = FALSE; 69b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fInBackslashQuote = FALSE; 70b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fModeFlags = fRXPat->fFlags | 0x80000000; 71b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fEOLComments = TRUE; 72b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 73b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchOpenParen = -1; 74b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchCloseParen = -1; 
751b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fCaptureName = NULL; 768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fLastSetLiteral = U_SENTINEL; 77b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 78b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { 79b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru status = rxp->fDeferredStatus; 80b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 81b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 82b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 83c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chAmp = 0x26; // '&' 84c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chDash = 0x2d; // '-' 85b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 86b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 87b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 88b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 89b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Destructor 90b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 91b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 92b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruRegexCompile::~RegexCompile() { 931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert delete fCaptureName; // Normally will be NULL, but can exist if pattern 941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // compilation stops with a syntax error. 
95b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 96b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 97c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { 98c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec)); 99c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 100c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Compile regex pattern. The state machine for rexexp pattern parsing is here. 104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The state tables are hand-written in the file regexcst.txt, 105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// and converted to the form used here by a perl 106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// script regexcst.pl 107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::compile( 110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeString &pat, // Source pat to be compiled. 
111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UParseError &pp, // Error position info 112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UErrorCode &e) // Error Code 113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 114103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fRXPat->fPatternString = new UnicodeString(pat); 11550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UText patternText = UTEXT_INITIALIZER; 11650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); 117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 11850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (U_SUCCESS(e)) { 11950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compile(&patternText, pp, e); 12050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utext_close(&patternText); 12150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 12250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 12350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 12450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// 12550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// compile, UText mode 12650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// All the work is actually done here. 12750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// 12850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid RegexCompile::compile( 12950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UText *pat, // Source pat to be compiled. 
13050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UParseError &pp, // Error position info 13150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode &e) // Error Code 13250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho{ 133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStatus = &e; 134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParseErr = &pp; 135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStackPtr = 0; 136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStack[fStackPtr] = 0; 137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // There should be no pattern stuff in the RegexPattern object. They can not be reused. 14350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0); 144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Prepare the RegexPattern object to receive the compiled pattern. 
14650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus); 1471b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (U_FAILURE(*fStatus)) { 1481b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return; 1491b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; 151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; 152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Initialize the pattern scanning state machine 15550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fPatternLength = utext_nativeLength(pat); 156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uint16_t state = 1; 157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const RegexTableEl *tableEl; 158103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 159103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // UREGEX_LITERAL force entire pattern to be treated as a literal string. 160103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (fModeFlags & UREGEX_LITERAL) { 161103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fQuoteMode = TRUE; 162103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 163103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextChar(fC); // Fetch the first char from the pattern string. 165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Main loop for the regex pattern parsing state machine. 
168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Runs once per state transition. 169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Each time through optionally performs, depending on the state table, 170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - an advance to the the next pattern char 171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - an action to be performed. 172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - pushing or popping a state to/from the local state return stack. 173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // file regexcst.txt is the source for the state table. The logic behind 174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // recongizing the pattern syntax is there, not here. 175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Bail out if anything has gone wrong. 178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Regex pattern parsing stops on the first error encountered. 179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(state != 0); 184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Find the state table element that matches the input char from the pattern, or the 186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // class of the input character. 
Start with the first table row for this 187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // state, then linearly scan forward until we find a row that matches the 188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // character. The last row for each state always matches all characters, so 189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the search will stop there, if not before. 190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tableEl = &gRuleParseStateTable[state]; 192c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", 193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); 194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { // loop through table rows belonging to this state, looking for one 196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // that matches the current input char. 197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru REGEX_SCAN_DEBUG_PRINTF((".")); 198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) { 199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Table row specified an individual character, not a set, and 200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the input character is not quoted, and 201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the input character matched it. 
202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fCharClass == 255) { 205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Table row specified default, match anything character class. 206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fCharClass == 254 && fC.fQuoted) { 209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Table row specified "quoted" and the char was quoted. 210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) { 213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Table row specified eof and we hit eof on the input. 
214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fC.fQuoted == FALSE && // char is not escaped && 219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fC.fChar != (UChar32)-1) { // char is not EOF 220103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius U_ASSERT(tableEl->fCharClass <= 137); 221c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Table row specified a character class, or set of characters, 223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and the current char matches it. 
224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // No match on this row, advance to the next row for this state, 229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru tableEl++; 230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru REGEX_SCAN_DEBUG_PRINTF(("\n")); 232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've found the row of the state table that matches the current input 235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // character from the rules string. 236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Perform any action specified by this row in the state table. 237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (doParseActions(tableEl->fAction) == FALSE) { 238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Break out of the state machine loop if the 239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the action signalled some kind of error, or 240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the action was to exit, occurs on normal end-of-rules-input. 
241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fPushState != 0) { 245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStackPtr++; 246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fStackPtr >= kStackSize) { 247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_INTERNAL_ERROR); 248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n")); 249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStackPtr--; 250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStack[fStackPtr] = tableEl->fPushState; 252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // NextChar. This is where characters are actually fetched from the pattern. 256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Happens under control of the 'n' tag in the state table. 257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fNextChar) { 259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextChar(fC); 260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Get the next state from the table entry, or from the 263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // state stack if the next state was specified as "pop". 
264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (tableEl->fNextState != 255) { 265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru state = tableEl->fNextState; 266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru state = fStack[fStackPtr]; 268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStackPtr--; 269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fStackPtr < 0) { 270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // state stack underflow 271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This will occur if the user pattern has mis-matched parentheses, 272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // with extra close parens. 273c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fStackPtr++; 275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MISMATCHED_PAREN); 276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 281c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 282c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Bail out if the pattern had errors. 283c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Set stack cleanup: a successful compile would have left it empty, 284c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // but errors can leave temporary sets hanging around. 
285c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru while (!fSetStack.empty()) { 286c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete (UnicodeSet *)fSetStack.pop(); 287c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 288c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return; 289c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 290c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The pattern has now been read and processed, and the compiled code generated. 293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The pattern's fFrameSize so far has accumulated the requirements for 297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // storage for capture parentheses, counters, etc. that are encountered 298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // in the pattern. Add space for the two variables that are always 29950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // present in the saved state: the input string position (int64_t) and 30050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // the position in the compiled pattern. 
30150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 3021b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert allocateStackData(RESTACKFRAME_HDRCOUNT); 30350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 30550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Optimization pass 1: NOPs, back-references, and case-folding 30650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 30750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho stripNOPs(); 308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Get bounds for the minimum and maximum length of a string that this 311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // pattern can match. Used to avoid looking for matches in strings that 312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are too short. 313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); 315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 31750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Optimization pass 2: match start type 318c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 319c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru matchStartType(); 320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Set up fast latin-1 range sets 323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t numSets = fRXPat->fSets->size(); 
325b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fSets8 = new Regex8BitSet[numSets]; 326c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Null pointer check. 327c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fRXPat->fSets8 == NULL) { 328c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru e = *fStatus = U_MEMORY_ALLOCATION_ERROR; 329c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return; 330c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i; 332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=0; i<numSets; i++) { 333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i); 334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fSets8[i].init(s); 335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// doParseAction Do some action during regex pattern parsing. 346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Called by the parse state machine. 
347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Generation of the match engine PCode happens here, or 349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// in functions called from the parse actions defined here. 350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool RegexCompile::doParseActions(int32_t action) 354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool returnVal = TRUE; 356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch ((Regex_PatternParseAction)action) { 358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPatStart: 360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Start of pattern compiles to: 361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //0 SAVE 2 Fall back to position of FAIL 362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //1 jmp 3 363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //2 FAIL Stop if we ever reach here. 364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //3 NOP Dummy, so start of pattern looks the same as 365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the start of an ( grouping. 
366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru //4 NOP Resreved, will be replaced by a save if there are 367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // OR | operators at the top level 3681b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STATE_SAVE, 2); 3691b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_JMP, 3); 3701b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_FAIL, 0); 371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 372b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Standard open nonCapture paren action emits the two NOPs and 373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // sets up the paren stack frame. 374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru doParseActions(doOpenNonCaptureParen); 375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 376b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPatFinish: 378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've scanned to the end of the pattern 379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The end of pattern compiles to: 380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // URX_END 381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // which will stop the runtime match engine. 382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Encountering end of pattern also behaves like a close paren, 383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and forces fixups of the State Save at the beginning of the compiled pattern 384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and of any OR operations at the top level. 
385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru handleCloseParen(); 387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fParenStack.size() > 0) { 388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Missing close paren in pattern. 389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MISMATCHED_PAREN); 390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // add the END operation to the compiled pattern. 3931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_END, 0); 394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Terminate the pattern compilation state machine. 396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru returnVal = FALSE; 397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOrOperator: 402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanning a '|', as in (A|B) 403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 404103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Generate code for any pending literals preceding the '|' 405103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 406103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert a SAVE operation at the start of the pattern section preceding 408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this OR at this level. 
This SAVE will branch the match forward 409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to the right hand side of the OR in the event that the left hand 410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // side fails to match and backtracks. Locate the position for the 411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // save from the location on the top of the parentheses stack. 412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t savePosition = fParenStack.popi(); 41350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition); 414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location 4151b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); 416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, savePosition); 417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Append an JMP operation into the compiled pattern. The operand for 419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the JMP will eventually be the location following the ')' for the 420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // group. This will be patched in later, when the ')' is encountered. 4211b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_JMP, 0); 422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Push the position of the newly added JMP op onto the parentheses stack. 424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This registers if for fixup when this block's close paren is encountered. 
425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Append a NOP to the compiled pattern. This is the slot reserved 428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // for a SAVE in the event that there is yet another '|' following 429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // this one. 4301b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4361b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doBeginNamedCapture: 4371b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Scanning (?<letter. 4381b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // The first letter of the name will come through again under doConinueNamedCapture. 
4391b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fCaptureName = new UnicodeString(); 4401b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (fCaptureName == NULL) { 4411b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_MEMORY_ALLOCATION_ERROR); 4421b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4431b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 4441b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 4451b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doContinueNamedCapture: 4461b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fCaptureName->append(fC.fChar); 4471b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 4481b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 4491b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doBadNamedCapture: 4501b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); 4511b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 4521b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenCaptureParen: 4541b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Open Capturing Paren, possibly named. 455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile to a 456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which later may be replaced by a save-state if the 457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parenthesized group gets a * quantifier, followed by 458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - START_CAPTURE n where n is stack frame offset to the capture group variables. 459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which may later be replaced by a save-state if there 460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is an '|' alternation within the parens. 
461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Each capture group gets three slots in the save stack frame: 46350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 0: Capture Group start position (in input string being matched.) 46450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 1: Capture Group end position. 46550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 2: Start of Match-in-progress. 466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The first two locations are for a completed capture group, and are 467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // referred to by back references and the like. 468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The third location stores the capture start position when an START_CAPTURE is 469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // encountered. This will be promoted to a completed capture when (and if) the corresponding 47050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // END_CAPTURE is encountered. 471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 472103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(); 4731b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 4741b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame. 4751b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_START_CAPTURE, varsLoc); 4761b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the two NOPs. 
Depending on what follows in the pattern, the 480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // NOPs may be changed to SAVE_STATE or JMP ops, with a target 481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // address of the end of the parenthesized group. 482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(capturing, *fStatus); // Frame type. 484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location 485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Save the mapping from group number to stack frame variable position. 488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fGroupMap->addElement(varsLoc, *fStatus); 4891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 4901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // If this is a named capture group, add the name->group number mapping. 4911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (fCaptureName != NULL) { 4921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t groupNumber = fRXPat->fGroupMap->size(); 4931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus); 4941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fCaptureName = NULL; // hash table takes ownership of the name (key) string. 
4951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (previousMapping > 0 && U_SUCCESS(*fStatus)) { 4961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); 4971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 5001b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenNonCaptureParen: 503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open non-caputuring (grouping only) Paren. 504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile to a 505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which later may be replaced by a save-state if the 506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parenthesized group gets a * quantifier, followed by 507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which may later be replaced by a save-state if there 508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is an '|' alternation within the parens. 509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 510103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(); 5111b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 5121b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the two NOPs. 
516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(plain, *fStatus); // Begin a new frame. 518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenAtomicParen: 525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Open Atomic Paren. (?> 526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile to a 527c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // - NOP, which later may be replaced if the parenthesized group 528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // has a quantifier, followed by 529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - STO_SP save state stack position, so it can be restored at the ")" 530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which may later be replaced by a save-state if there 531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is an '|' alternation within the parens. 
532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 533103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(); 5341b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 5351b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr. 5361b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STO_SP, varLoc); 5371b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the two NOPs. Depending on what follows in the pattern, the 541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // NOPs may be changed to SAVE_STATE or JMP ops, with a target 542b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // address of the end of the parenthesized group. 543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 544b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(atomic, *fStatus); // Frame type. 
545b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP 546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 551b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenLookAhead: 552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Positive Look-ahead (?= stuff ) 553c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 554c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Note: Addition of transparent input regions, with the need to 555c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // restore the original regions when failing out of a lookahead 556c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // block, complicated this sequence. Some conbined opcodes 557c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // might make sense - or might not, lookahead aren't that common. 558c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 559c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Caution: min match length optimization knows about this 560c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // sequence; don't change without making updates there too. 561c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 563c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 1 START_LA dataLoc Saves SP, Input Pos 564c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 2. 
STATE_SAVE 4 on failure of lookahead, goto 4 565c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 3 JMP 6 continue ... 566c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 567c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4. LA_END Look Ahead failed. Restore regions. 568c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 5. BACKTRACK and back track again. 569c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 570c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 6. NOP reserved for use by quantifiers on the block. 571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look-ahead can't have quantifiers, but paren stack 572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // compile time conventions require the slot anyhow. 573c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 7. NOP may be replaced if there is are '|' ops in the block. 574c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 8. code for parenthesized stuff. 575c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 9. LA_END 576c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Two data slots are reserved, for saving the stack ptr and the input position. 
578b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 579103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(); 5801b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t dataLoc = allocateData(2); 5811b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LA_START, dataLoc); 5821b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); 5831b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3); 5841b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LA_END, dataLoc); 5851b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKTRACK, 0); 5861b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 5871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 590c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // of the NOPs. 591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(lookAhead, *fStatus); // Frame type. 
593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenLookAheadNeg: 599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Negated Lookahead. (?! stuff ) 600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 601b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. START_LA dataloc 602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, 603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // // which continues with the match. 604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. NOP // Std. Open Paren sequence, for possible '|' 605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. code for parenthesized stuff. 606b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. END_LA // Cut back stack, remove saved state from step 2. 607c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. 608c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 7. END_LA // Restore match region, in case look-ahead was using 609c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // an alternate (transparent) region. 
610b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 611103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(); 6121b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t dataLoc = allocateData(2); 6131b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LA_START, dataLoc); 6141b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STATE_SAVE, 0); // dest address will be patched later. 6151b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 616b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 617b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 618c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // of the StateSave and NOP. 619b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 620c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fParenStack.push(negLookAhead, *fStatus); // Frame type 621b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location 622b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 623c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 62450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Instructions #5 - #7 will be added when the ')' is encountered. 625b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 626b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 627b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 628b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenLookBehind: 629b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 630b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile a (?<= look-behind open paren. 
631b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 632b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 633b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 0 URX_LB_START dataLoc 634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1 URX_LB_CONT dataLoc 635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2 MinMatchLen 636b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3 MaxMatchLen 637b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4 URX_NOP Standard '(' boilerplate. 638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5 URX_NOP Reserved slot for use with '|' ops within (block). 639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 6 <code for LookBehind expression> 640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 7 URX_LB_END dataLoc # Check match len, restore input len 641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 8 URX_LA_END dataLoc # Restore stack, input pos 642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 643b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Allocate a block of matcher data, to contain (when running a match) 644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 0: Stack ptr on entry 645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1: Input Index on entry 646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2: Start index of match current match attempt. 647c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 3: Original Input String len. 648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 649103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Generate match code for any pending literals. 
650103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(); 651103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Allocate data space 6531b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t dataLoc = allocateData(4); 654c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit URX_LB_START 6561b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LB_START, dataLoc); 657c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit URX_LB_CONT 6591b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LB_CONT, dataLoc); 6601b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later. 6611b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later. 662c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 6631b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Emit the NOPs 6641b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 6651b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 666c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 668c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // of the URX_LB_CONT and the NOP. 
669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(lookBehind, *fStatus); // Frame type 671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 673c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The final two instructions will be added when the ')' is encountered. 675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpenLookBehindNeg: 680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile a (?<! negated look-behind open paren. 682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 0 URX_LB_START dataLoc # Save entry stack, input len 685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1 URX_LBN_CONT dataLoc # Iterate possible match positions 686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2 MinMatchLen 687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3 MaxMatchLen 688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4 continueLoc (9) 689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5 URX_NOP Standard '(' boilerplate. 
690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 6 URX_NOP Reserved slot for use with '|' ops within (block). 691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 7 <code for LookBehind expression> 692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL 693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 9 ... 694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Allocate a block of matcher data, to contain (when running a match) 696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 0: Stack ptr on entry 697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1: Input Index on entry 698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2: Start index of match current match attempt. 699c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 3: Original Input String len. 700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 701103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Generate match code for any pending literals. 
702103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(); 703103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Allocate data space 7051b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t dataLoc = allocateData(4); 706c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit URX_LB_START 7081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LB_START, dataLoc); 709c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit URX_LBN_CONT 7111b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LBN_CONT, dataLoc); 7121b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later. 7131b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later. 7141b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later. 715c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 7161b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Emit the NOPs 7171b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 7181b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 719c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 721c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // of the URX_LB_CONT and the NOP. 
722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); // Match mode state 723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(lookBehindN, *fStatus); // Frame type 724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 726c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 727b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The final two instructions will be added when the ')' is encountered. 728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doConditionalExpr: 732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Conditionals such as (?(1)a:b) 733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPerlInline: 734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. 735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_UNIMPLEMENTED); 736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doCloseParen: 740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru handleCloseParen(); 741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fParenStack.size() <= 0) { 742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Extra close paren, or missing open paren. 
743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MISMATCHED_PAREN); 744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doNOP: 748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBadOpenParenType: 752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doRuleError: 753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_RULE_SYNTAX); 754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doMismatchedParenErr: 758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MISMATCHED_PAREN); 759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPlus: 762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Normal '+' compiles to 763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. stuff to be repeated (already built) 764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. jmp-sav 1 765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. ... 
766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Or, if the item to be repeated can match a zero length string, 768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STO_INP_LOC data-loc 769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of stuff to be repeated 770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. JMP_SAV_X 2 771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. ... 772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Or, if the item to be repeated is simple 775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. Item to be repeated. 776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. LOOP_SR_I set number (assuming repeated item is a set ref) 777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. LOOP_C stack location 778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(FALSE); // location of item #1 780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t frameLoc; 781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 782b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check for simple constructs, which may get special optimized code. 
783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (topLoc == fRXPat->fCompiledPat->size() - 1) { 78450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_SETREF) { 787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit optimized code for [char set]+ 7881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 7891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert frameLoc = allocateStackData(1); 7901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LOOP_C, frameLoc); 791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_DOTANY || 795c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 796c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit Optimized code for .+ operations. 7981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); 799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 800c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. 
801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopOpI |= 1; 802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 803c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fModeFlags & UREGEX_UNIX_LINES) { 804c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru loopOpI |= 2; 805c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 8061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(loopOpI); 8071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert frameLoc = allocateStackData(1); 8081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LOOP_C, frameLoc); 809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 814b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // General case. 815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check for minimum match length of zero, which requires 817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // extra loop-breaking code. 818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { 819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Zero length match is possible. 820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the code sequence that can handle it. 
821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topLoc); 8221b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert frameLoc = allocateStackData(1); 823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 8241b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t op = buildOp(URX_STO_INP_LOC, frameLoc); 825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc); 826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 8271b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_JMP_SAV_X, topLoc+1); 828b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Simpler code when the repeated body must match something non-empty 8301b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_JMP_SAV, topLoc); 831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doNGPlus: 836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Non-greedy '+?' compiles to 837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. stuff to be repeated (already built) 838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. state-save 1 839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. ... 
840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(FALSE); 8421b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STATE_SAVE, topLoc); 843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doOpt: 848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Normal (greedy) ? quantifier. 849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. state save 3 851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of optional block 852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. ... 853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert the state save into the compiled pattern, and we're done. 854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t saveStateLoc = blockTopLoc(TRUE); 8561b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); 857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doNGOpt: 862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Non-greedy ?? 
quantifier 863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // compiles to 864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. jmp 4 865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of optional block 866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3 jmp 5 867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. state save 2 868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5 ... 869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This code is less than ideal, with two jmps instead of one, because we can only 870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // insert one instruction at the top of the block being iterated. 871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmp1_loc = blockTopLoc(TRUE); 873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmp2_loc = fRXPat->fCompiledPat->size(); 874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 8751b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1); 876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); 877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 8781b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_JMP, jmp2_loc+2); 879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 8801b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STATE_SAVE, jmp1_loc+1); 881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doStar: 
886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Normal (greedy) * quantifier. 887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STATE_SAVE 4 889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of stuff being iterated over 890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. JMP_SAV 2 891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. ... 892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Or, if the body is a simple [Set], 894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. LOOP_SR_I set number 895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. LOOP_C stack location 896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // ... 897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 898c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Or if this is a .* 899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. LOOP_DOT_I (. matches all mode flag) 900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. LOOP_C stack location 901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Or, if the body can match a zero-length string, to inhibit infinite loops, 903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STATE_SAVE 5 904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. STO_INP_LOC data-loc 905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. body of stuff 906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. JMP_SAV_X 2 907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. ... 
908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // location of item #1, the STATE_SAVE 910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(FALSE); 911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = -1; 912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 913b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check for simple *, where the construct being repeated 914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // compiled to single opcode, and might be optimizable. 915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (topLoc == fRXPat->fCompiledPat->size() - 1) { 91650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_SETREF) { 919c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Emit optimized code for a [char set]* 9201b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 9221b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert dataLoc = allocateStackData(1); 9231b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LOOP_C, dataLoc); 924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_DOTANY || 928c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 
929c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit Optimized code for .* operations. 9311b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0); 932b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. 934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopOpI |= 1; 935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 936c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { 937c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru loopOpI |= 2; 938c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 9401b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert dataLoc = allocateStackData(1); 9411b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LOOP_C, dataLoc); 942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit general case code for this * 947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The optimizations did not apply. 
948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 949b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t saveStateLoc = blockTopLoc(TRUE); 9501b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1); 951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Check for minimum match length of zero, which requires 953b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // extra loop-breaking code. 954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { 955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(saveStateLoc); 9561b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert dataLoc = allocateStackData(1); 957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 9581b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t op = buildOp(URX_STO_INP_LOC, dataLoc); 959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); 9601b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2); 961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 962c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Locate the position in the compiled pattern where the match will continue 964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // after completing the *. (4 or 5 in the comment above) 965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 9671b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Put together the save state op and store it into the compiled code. 
9681b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc); 969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. 9721b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(jmpOp); 973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doNGStar: 977b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Non-greedy *? quantifier 978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // compiles to 979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. JMP 3 980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of stuff being iterated over 981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. STATE_SAVE 2 982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4 ... 983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. 985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. 
9861b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t jmpOp = buildOp(URX_JMP, saveLoc); 987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); 9881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STATE_SAVE, jmpLoc+1); 989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doIntervalInit: 994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The '{' opening an interval quantifier was just scanned. 995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Init the counter varaiables that will accumulate the values as the digits 996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are scanned. 997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIntervalLow = 0; 998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIntervalUpper = -1; 999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doIntevalLowerDigit: 1002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanned a digit from the lower value of an {lower,upper} interval 1003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t digitValue = u_charDigitValue(fC.fChar); 1005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(digitValue >= 0); 10061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int64_t val = (int64_t)fIntervalLow*10 + digitValue; 10071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (val > INT32_MAX) { 
1008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_NUMBER_TOO_BIG); 10091b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } else { 10101b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fIntervalLow = (int32_t)val; 1011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1013b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doIntervalUpperDigit: 1016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanned a digit from the upper value of an {lower,upper} interval 1017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalUpper < 0) { 1019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIntervalUpper = 0; 1020b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t digitValue = u_charDigitValue(fC.fChar); 1022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(digitValue >= 0); 10231b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int64_t val = (int64_t)fIntervalUpper*10 + digitValue; 10241b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (val > INT32_MAX) { 1025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_NUMBER_TOO_BIG); 10261b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } else { 10271b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fIntervalUpper = (int32_t)val; 1028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1029b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
1032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doIntervalSame: 1033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scanned a single value interval like {27}. Upper = Lower. 1034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fIntervalUpper = fIntervalLow; 1035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doInterval: 1038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Finished scanning a normal {lower,upper} interval. Generate the code for it. 1039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (compileInlineInterval() == FALSE) { 1040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPossessiveInterval: 1045b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. 1046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Remember the loc for the top of the block being looped over. 1048c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // (Can not reserve a slot in the compiled pattern at this time, because 1049c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // compileInterval needs to reserve also, and blockTopLoc can only reserve 1050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // once per block.) 
1051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(FALSE); 1052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Produce normal looping code. 1054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1055b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Surround the just-emitted normal looping code with a STO_SP ... LD_SP 1057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // just as if the loop was inclosed in atomic parentheses. 1058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1059b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // First the STO_SP before the start of the loop 1060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topLoc); 10611b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 10621b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t varLoc = allocateData(1); // Reserve a data location for saving the 10631b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t op = buildOp(URX_STO_SP, varLoc); 1064b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc); 1065b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 106650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); 1067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); 1068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopOp++; // point LoopOp after the just-inserted STO_SP 1069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->push(loopOp, *fStatus); 1070b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
1071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Then the LD_SP after the end of the loop 10721b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LD_SP, varLoc); 1073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doNGInterval: 1078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. 1079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); 1080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doIntervalError: 1083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_BAD_INTERVAL); 1084b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1085b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1086b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doLiteralChar: 1087c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We've just scanned a "normal" character from the pattern, 1088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru literalChar(fC.fChar); 1089b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1092c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doEscapedLiteralChar: 1093c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We've just scanned an backslashed escaped character with no 
1094c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // special meaning. It represents itself. 1095c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1096c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1097c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1098c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1099c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1100c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru literalChar(fC.fChar); 1101c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1102c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1103b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doDotAny: 1105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // scanned a ".", match any single character. 
1106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1107103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 1108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_DOTALL) { 11091b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_DOTANY_ALL, 0); 1110c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if (fModeFlags & UREGEX_UNIX_LINES) { 11111b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_DOTANY_UNIX, 0); 1112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 11131b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_DOTANY, 0); 1114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1118c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doCaret: 1119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1120103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 1121c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 11221b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_CARET, 0); 1123c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 11241b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_CARET_M, 0); 1125c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 11261b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_CARET, 0); // Only testing true start of input. 
1127c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 11281b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_CARET_M_UNIX, 0); 1129c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1133c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doDollar: 1134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1135103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 1136c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 11371b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_DOLLAR, 0); 1138c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 11391b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_DOLLAR_M, 0); 1140c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 11411b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_DOLLAR_D, 0); 1142c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 11431b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_DOLLAR_MD, 0); 1144c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
1148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashA: 1149103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 11501b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_CARET, 0); 1151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashB: 1154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru #if UCONFIG_NO_BREAK_ITERATION==1 1156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_UWORD) { 1157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_UNSUPPORTED_ERROR); 1158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru #endif 1160103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 1161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = (fModeFlags & UREGEX_UWORD)? 
URX_BACKSLASH_BU : URX_BACKSLASH_B; 11621b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(op, 1); 1163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashb: 1167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru #if UCONFIG_NO_BREAK_ITERATION==1 1169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_UWORD) { 1170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_UNSUPPORTED_ERROR); 1171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru #endif 1173103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 1174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op = (fModeFlags & UREGEX_UWORD)? 
URX_BACKSLASH_BU : URX_BACKSLASH_B; 11751b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(op, 0); 1176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashD: 1180103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 11811b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKSLASH_D, 1); 1182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashd: 1185103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 11861b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKSLASH_D, 0); 1187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashG: 1190103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 11911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKSLASH_G, 0); 11921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 11931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 11941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doBackslashH: 11951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fixLiterals(FALSE); 11961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKSLASH_H, 1); 11971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 11981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 11991b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doBackslashh: 12001b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 
fixLiterals(FALSE); 12011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKSLASH_H, 0); 12021b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 12031b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 12041b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doBackslashR: 12051b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fixLiterals(FALSE); 12061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKSLASH_R, 0); 1207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1208b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashS: 1210103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 12111b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET); 1212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashs: 1215103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 12161b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET); 12171b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 12181b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 12191b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doBackslashV: 12201b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fixLiterals(FALSE); 12211b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKSLASH_V, 1); 12221b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 12231b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 12241b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doBackslashv: 12251b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fixLiterals(FALSE); 
12261b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKSLASH_V, 0); 1227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashW: 1230103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 12311b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET); 1232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashw: 1235103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 12361b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STATIC_SETREF, URX_ISWORD_SET); 1237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashX: 1240103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 12411b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKSLASH_X, 0); 1242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashZ: 1246103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 12471b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_DOLLAR, 0); 1248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackslashz: 1251103e9ffba2cba345d0078eb8b8db33249f81840aCraig 
Cornelius fixLiterals(FALSE); 12521b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKSLASH_Z, 0); 1253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doEscapeError: 1256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doExit: 1260103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 1261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru returnVal = FALSE; 1262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doProperty: 1265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1266103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 1267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *theSet = scanProp(); 1268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru compileSet(theSet); 1269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1272c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doNamedChar: 1273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1274c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c = scanNamedChar(); 1275c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru literalChar(c); 1276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
1277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBackRef: 1281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // BackReference. Somewhat unusual in that the front-end can not completely parse 1282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the regular expression, because the number of digits to be consumed 1283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // depends on the number of capture groups that have been defined. So 1284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we have to do it here instead. 1285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t numCaptureGroups = fRXPat->fGroupMap->size(); 1287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t groupNum = 0; 1288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = fC.fChar; 1289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 1291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop once per digit, for max allowed number of digits in a back reference. 
1292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t digit = u_charDigitValue(c); 1293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru groupNum = groupNum * 10 + digit; 1294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (groupNum >= numCaptureGroups) { 1295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru c = peekCharLL(); 1298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) { 1299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru nextCharLL(); 1302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Scan of the back reference in the source regexp is complete. Now generate 1305c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the compiled code for it. 1306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Because capture groups can be forward-referenced by back-references, 1307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // we fill the operand with the capture group number. At the end 1308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of compilation, it will be changed to the variable's location. 1309103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence, 1310103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // and shouldn't enter this code path at all. 
1311103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 1312b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 13131b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKREF_I, groupNum); 1314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 13151b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKREF, groupNum); 1316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 13201b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doBeginNamedBackRef: 13211b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT(fCaptureName == NULL); 13221b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fCaptureName = new UnicodeString; 13231b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (fCaptureName == NULL) { 13241b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_MEMORY_ALLOCATION_ERROR); 13251b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 13261b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 13271b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 13281b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doContinueNamedBackRef: 13291b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fCaptureName->append(fC.fChar); 13301b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 1331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 13321b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doCompleteNamedBackRef: 13331b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert { 13341b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName); 
13351b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (groupNumber == 0) { 13361b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Group name has not been defined. 13371b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Could be a forward reference. If we choose to support them at some 13381b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // future time, extra mechanism will be required at this point. 13391b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_INVALID_CAPTURE_GROUP_NAME); 13401b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } else { 13411b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Given the number, handle identically to a \n numbered back reference. 13421b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // See comments above, under doBackRef 13431b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fixLiterals(FALSE); 13441b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 13451b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKREF_I, groupNumber); 13461b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } else { 13471b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKREF, groupNumber); 13481b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 13491b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 13501b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert delete fCaptureName; 13511b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fCaptureName = NULL; 13521b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 13531b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 13541b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 1355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPossessivePlus: 1356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Possessive ++ quantifier. 
1357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 1358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STO_SP 1359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. body of stuff being iterated over 1360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. STATE_SAVE 5 1361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. JMP 2 1362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. LD_SP 1363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 6. ... 1364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note: TODO: This is pretty inefficient. A mass of saved state is built up 1366c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 1367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the STO_SP 1370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(TRUE); 13711b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 
13721b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t op = buildOp(URX_STO_SP, stoLoc); 1373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc); 1374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1375b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the STATE_SAVE 13761b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); 1377c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the JMP 13791b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_JMP, topLoc+1); 1380b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the LD_SP 13821b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LD_SP, stoLoc); 1383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPossessiveStar: 1387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Possessive *+ quantifier. 1388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 1389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STO_SP loc 1390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. STATE_SAVE 5 1391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. body of stuff being iterated over 1392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. JMP 2 1393b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. LD_SP loc 1394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 6 ... 
1395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: do something to cut back the state stack each time through the loop. 1396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Reserve two slots at the top of the block. 1398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(TRUE); 1399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topLoc); 1400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // emit STO_SP loc 14021b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 14031b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t op = buildOp(URX_STO_SP, stoLoc); 1404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc); 1405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the SAVE_STATE 5 1407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t L7 = fRXPat->fCompiledPat->size()+1; 14081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert op = buildOp(URX_STATE_SAVE, L7); 1409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1411c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Append the JMP operation. 
14121b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_JMP, topLoc+1); 1413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the LD_SP loc 14151b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LD_SP, stoLoc); 1416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doPossessiveOpt: 1420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Possessive ?+ quantifier. 1421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compiles to 1422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. STO_SP loc 1423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. SAVE_STATE 5 1424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. body of optional block 1425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. LD_SP loc 1426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. ... 1427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Reserve two slots at the top of the block. 1430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topLoc = blockTopLoc(TRUE); 1431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topLoc); 1432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the STO_SP 14341b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr. 
14351b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t op = buildOp(URX_STO_SP, stoLoc); 1436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc); 1437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the SAVE_STATE 1439b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 14401b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert op = buildOp(URX_STATE_SAVE, continueLoc); 1441b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1442b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1443b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Emit the LD_SP 14441b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LD_SP, stoLoc); 1445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1446b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBeginMatchMode: 1450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNewModeFlags = fModeFlags; 1451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fSetModeFlag = TRUE; 1452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1454b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doMatchMode: // (?i) and similar 1455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t bit = 0; 1457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (fC.fChar) { 1458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break; 1459c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break; 1460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break; 1461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x73: /* 's' */ bit = UREGEX_DOTALL; break; 1462c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break; 1463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x77: /* 'w' */ bit = UREGEX_UWORD; break; 1464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; 1465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 0x2d: /* '-' */ fSetModeFlag = FALSE; break; 1466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 1467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); // Should never happen. Other chars are filtered out 1468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // by the scanner. 
1469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fSetModeFlag) { 1471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNewModeFlags |= bit; 1472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fNewModeFlags &= ~bit; 1474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doSetMatchMode: 1479103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Emit code to match any pending literals, using the not-yet changed match mode. 1480103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(); 1481103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 1482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've got a (?i) or similar. The match mode is being changed, but 1483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the change is not scoped to a parenthesized block. 1484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(fNewModeFlags < 0); 1485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fModeFlags = fNewModeFlags; 1486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doMatchModeParen: 1491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We've got a (?i: or similar. 
Begin a parenthesized block, save old 1492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // mode flags so they can be restored at the close of the block. 1493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Compile to a 1495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which later may be replaced by a save-state if the 1496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parenthesized group gets a * quantifier, followed by 1497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // - NOP, which may later be replaced by a save-state if there 1498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is an '|' alternation within the parens. 1499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 1500103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 15011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 15021b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_NOP, 0); 1503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // On the Parentheses stack, start a new frame and add the postions 1505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the two NOPs (a normal non-capturing () frame, except for the 1506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // saving of the orignal mode flags.) 
1507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fModeFlags, *fStatus); 1508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(flags, *fStatus); // Frame Marker 1509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP 1510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 1511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Set the current mode flags to the new values. 1513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(fNewModeFlags < 0); 1514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fModeFlags = fNewModeFlags; 1515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doBadModeFlag: 1519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_INVALID_FLAG); 1520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case doSuppressComments: 1523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We have just scanned a '(?'. We now need to prevent the character scanner from 1524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // treating a '#' as a to-the-end-of-line comment. 1525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (This Perl compatibility just gets uglier and uglier to do...) 
1526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fEOLComments = FALSE; 1527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1530c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetAddAmp: 1531c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1532c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1533c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(chAmp); 1534c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1535c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1536c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1537c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetAddDash: 1538c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1539c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1540c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(chDash); 1541c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1542c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1543c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1544c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_s: 1545c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1546c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1547c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1548c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1549c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1550c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 
1551c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_S: 1552c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1553c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1554c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1555c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru SSet.complement(); 1556c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(SSet); 1557c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1558c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1559c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1560c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_d: 1561c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1562c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1563c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO - make a static set, ticket 6058. 1564c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_ND_MASK, *fStatus); 1565c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1566c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1567c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1568c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_D: 1569c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1570c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1571c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet digits; 1572c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO - make a static set, ticket 6058. 
1573c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 1574c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru digits.complement(); 1575c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(digits); 1576c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1577c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1578c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 15791b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doSetBackslash_h: 15801b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert { 15811b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 15821b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert UnicodeSet h; 15831b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); 15841b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert h.add((UChar32)9); // Tab 15851b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert set->addAll(h); 15861b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 15871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 15881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 15891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doSetBackslash_H: 15901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert { 15911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 15921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert UnicodeSet h; 15931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); 15941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert h.add((UChar32)9); // Tab 15951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert h.complement(); 
15961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert set->addAll(h); 15971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 15981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 15991b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 16001b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doSetBackslash_v: 16011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert { 16021b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 16031b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert set->add((UChar32)0x0a, (UChar32)0x0d); // add range 16041b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert set->add((UChar32)0x85); 16051b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert set->add((UChar32)0x2028, (UChar32)0x2029); 16061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 16071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 16081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 16091b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case doSetBackslash_V: 16101b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert { 16111b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 16121b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert UnicodeSet v; 16131b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert v.add((UChar32)0x0a, (UChar32)0x0d); // add range 16141b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert v.add((UChar32)0x85); 16151b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert v.add((UChar32)0x2028, (UChar32)0x2029); 16161b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert v.complement(); 16171b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert set->addAll(v); 16181b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 16191b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 16201b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 
1621c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_w: 1622c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1623c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1624c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1625c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1626c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1627c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1628c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBackslash_W: 1629c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1630c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1631c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1632c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru SSet.complement(); 1633c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->addAll(SSet); 1634c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1635c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1636c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1637c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBegin: 1638103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); 1639c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetStack.push(new UnicodeSet(), *fStatus); 1640c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setStart, *fStatus); 1641c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1642c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setCaseClose, *fStatus); 
1643c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1644c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1645c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1646c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBeginDifference1: 1647c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned something like [[abc]-[ 1648c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Set up a new UnicodeSet for the set beginning with the just-scanned '[' 1649c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Push a Difference operator, which will cause the new set to be subtracted from what 1650c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // went before once it is created. 1651c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setPushOp(setDifference1); 1652c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setStart, *fStatus); 1653c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1654c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setCaseClose, *fStatus); 1655c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1656c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1657c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1658c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBeginIntersection1: 1659c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned something like [[abc]&[ 1660c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Need both the '&' operator and the open '[' operator. 
1661c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setPushOp(setIntersection1); 1662c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setStart, *fStatus); 1663c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1664c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setCaseClose, *fStatus); 1665c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1666c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1667c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1668c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetBeginUnion: 1669c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned something like [[abc][ 1670c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Need to handle the union operation explicitly [[abc] | [ 1671c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setPushOp(setUnion); 1672c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setStart, *fStatus); 1673c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1674c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setCaseClose, *fStatus); 1675c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1676c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1677c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1678c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetDifference2: 1679c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned something like [abc-- 1680c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Consider this to unambiguously be a set difference operator. 
1681c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setPushOp(setDifference2); 1682c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1683c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1684c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetEnd: 1685c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Have encountered the ']' that closes a set. 1686c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Force the evaluation of any pending operations within this set, 1687c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // leave the completed set on the top of the set stack. 1688c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setEval(setEnd); 1689b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru U_ASSERT(fSetOpStack.peeki()==setStart); 1690b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru fSetOpStack.popi(); 1691c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1692c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1693c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetFinish: 1694c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1695c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Finished a complete set expression, including all nested sets. 1696c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The close bracket has already triggered clearing out pending set operators, 1697c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the operator stack should be empty and the operand stack should have just 1698c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // one entry, the result set. 
1699c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fSetOpStack.empty()); 1700c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); 1701c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fSetStack.empty()); 1702c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru compileSet(theSet); 1703c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1704c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1705fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1706c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetIntersection2: 1707c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Have scanned something like [abc&& 1708c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setPushOp(setIntersection2); 1709c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1710c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1711c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetLiteral: 1712c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Union the just-scanned literal character into the set being built. 1713c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This operation is the highest precedence set operation, so we can always do 1714c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // it immediately, without waiting to see what follows. 
It is necessary to perform 1715c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // any pending '-' or '&' operation first, because these have the same precedence 1716fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // as union-ing in a literal' 1717c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1718c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setEval(setUnion); 1719c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1720c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru s->add(fC.fChar); 1721c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLastSetLiteral = fC.fChar; 1722c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1723c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1724c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1725c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetLiteralEscaped: 1726c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // A back-slash escaped literal character was encountered. 1727c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Processing is the same as with setLiteral, above, with the addition of 1728c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the optional check for errors on escaped ASCII letters. 
1729c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1730c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1731c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1732c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1733c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1734c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1735c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setEval(setUnion); 1736c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1737c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru s->add(fC.fChar); 1738c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLastSetLiteral = fC.fChar; 1739c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1740c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1741c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1742c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetNamedChar: 1743c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scanning a \N{UNICODE CHARACTER NAME} 1744c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Aside from the source of the character, the processing is identical to doSetLiteral, 1745c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // above. 
1746c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1747c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c = scanNamedChar(); 1748c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setEval(setUnion); 1749c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1750c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru s->add(c); 1751c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLastSetLiteral = c; 1752c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1753c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1754c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1755c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetNamedRange: 1756c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned literal-\N{CHAR NAME}. Add the range to the set. 1757c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The left character is already in the set, and is saved in fLastSetLiteral. 1758c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The right side needs to be picked up, the scan is at the 'N'. 1759c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Lower Limit > Upper limit being an error matches both Java 1760c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // and ICU UnicodeSet behavior. 
1761c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1762c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 c = scanNamedChar(); 17638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) { 1764c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_INVALID_RANGE); 1765c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1766c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1767c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru s->add(fLastSetLiteral, c); 1768c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLastSetLiteral = c; 1769c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1770c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1771c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1772c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 177350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case doSetNegate: 1774c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scanned a '^' at the start of a set. 1775c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Push the negation operator onto the set op stack. 1776c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // A twist for case-insensitive matching: 1777c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // the case closure operation must happen _before_ negation. 1778c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // But the case closure operation will already be on the stack if it's required. 1779c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This requires checking for case closure, and swapping the stack order 1780c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // if it is present. 
1781c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1782c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t tosOp = fSetOpStack.peeki(); 1783c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (tosOp == setCaseClose) { 1784c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.popi(); 1785c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setNegation, *fStatus); 1786c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setCaseClose, *fStatus); 1787c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 1788c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(setNegation, *fStatus); 1789c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1790c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1791c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1792c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1793c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetNoCloseError: 1794c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_MISSING_CLOSE_BRACKET); 1795c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1796c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1797c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetOpError: 1798c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal. 
1799c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1800c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1801c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetPosixProp: 1802c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1803c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = scanPosixProp(); 1804c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (s != NULL) { 1805c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1806c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru tos->addAll(*s); 1807c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete s; 1808c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } // else error. scanProp() reported the error status already. 1809c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1810c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1811fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1812c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetProp: 1813c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scanned a \p \P within [brackets]. 1814c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 1815c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = scanProp(); 1816c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (s != NULL) { 1817c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1818c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru tos->addAll(*s); 1819c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete s; 1820c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } // else error. scanProp() reported the error status already. 
1821c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1822c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1823c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1824c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1825c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case doSetRange: 1826c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // We have scanned literal-literal. Add the range to the set. 1827c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The left character is already in the set, and is saved in fLastSetLiteral. 1828c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The right side is the current character. 1829c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Lower Limit > Upper limit being an error matches both Java 1830c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // and ICU UnicodeSet behavior. 1831c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 18328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 18338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) { 1834fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius error(U_REGEX_INVALID_RANGE); 1835c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1836c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1837c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru s->add(fLastSetLiteral, fC.fChar); 1838c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 1839c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 1840c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 1841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 1842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 1843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
error(U_REGEX_INTERNAL_ERROR); 1844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 1845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 1848b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru returnVal = FALSE; 1849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return returnVal; 1852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1854b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1855b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1858b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// literalChar We've encountered a literal character from the pattern, 1859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// or an escape sequence that reduces to a character. 1860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Add it to the string containing all literal chars/strings from 1861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// the pattern. 
1862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::literalChar(UChar32 c) { 1865103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fLiteralChars.append(c); 1866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1869b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1870b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1871b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// fixLiterals When compiling something that can follow a literal 1872103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// string in a pattern, emit the code to match the 1873103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// accumulated literal string. 1874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Optionally, split the last char of the string off into 1876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// a single "ONE_CHAR" operation, so that quantifiers can 1877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// apply to that char alone. Example: abc* 1878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The * must apply to the 'c' only. 
1879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 1881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::fixLiterals(UBool split) { 1882103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 1883103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // If no literal characters have been scanned but not yet had code generated 1884103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // for them, nothing needs to be done. 1885103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (fLiteralChars.length() == 0) { 1886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 1887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1889103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1890103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1891103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 1892fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Split: We need to ensure that the last item in the compiled pattern 1893103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // refers only to the last literal scanned in the pattern, so that 1894103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // quantifiers (*, +, etc.) affect only it, and not a longer string. 1895103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Split before case folding for case insensitive matches. 
1896103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 1897103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (split) { 1898103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fLiteralChars.truncate(indexOfLastCodePoint); 1899103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string. 1900103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Note that the truncated literal string may be empty, in which case 1901103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // nothing will be emitted. 1902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1903103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal. 1904103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(FALSE); // Second recursive call, code for the final code point. 1905b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 1906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1908103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // If we are doing case-insensitive matching, case fold the string. This may expand 1909103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // the string, e.g. 
the German sharp-s turns into "ss" 1910103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1911103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fLiteralChars.foldCase(); 1912103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1913103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1914103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1915103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 1916103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (indexOfLastCodePoint == 0) { 1917103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Single character, emit a URX_ONECHAR op to match it. 1918fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && 1919103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { 19201b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_ONECHAR_I, lastCodePoint); 1921103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else { 19221b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_ONECHAR, lastCodePoint); 1923103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 1924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 1925103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Two or more chars, emit a URX_STRING to match them. 
19261b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) { 19271b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_PATTERN_TOO_BIG); 19281b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 1929103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 19301b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STRING_I, fRXPat->fLiteralText.length()); 1931103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } else { 1932103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // TODO here: add optimization to split case sensitive strings of length two 1933103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // into two single char ops, for efficiency. 19341b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STRING, fRXPat->fLiteralText.length()); 1935103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 19361b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_STRING_LEN, fLiteralChars.length()); 1937fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1938103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Add this string into the accumulated strings of the compiled pattern. 
1939103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fRXPat->fLiteralText.append(fLiteralChars); 1940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 1941103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 1942103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fLiteralChars.remove(); 1943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 1944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 19461b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertint32_t RegexCompile::buildOp(int32_t type, int32_t val) { 19471b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (U_FAILURE(*fStatus)) { 19481b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return 0; 19491b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 19501b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (type < 0 || type > 255) { 19511b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT(FALSE); 19521b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_INTERNAL_ERROR); 19531b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert type = URX_RESERVED_OP; 19541b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 19551b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (val > 0x00ffffff) { 19561b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT(FALSE); 19571b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_INTERNAL_ERROR); 19581b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert val = 0; 19591b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 19601b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (val < 0) { 19611b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) { 19621b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT(FALSE); 19631b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_INTERNAL_ERROR); 
19641b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return -1; 19651b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 19661b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (URX_TYPE(val) != 0xff) { 19671b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT(FALSE); 19681b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_INTERNAL_ERROR); 19691b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return -1; 19701b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 19711b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert type = URX_RESERVED_OP_N; 19721b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 19731b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return (type << 24) | val; 19741b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert} 19751b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 1976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 19771b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert//------------------------------------------------------------------------------ 19781b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// 19791b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// appendOp() Append a new instruction onto the compiled pattern 19801b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// Includes error checking, limiting the size of the 19811b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// pattern to lengths that can be represented in the 19821b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// 24 bit operand field of an instruction. 
19831b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// 19841b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert//------------------------------------------------------------------------------ 19851b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertvoid RegexCompile::appendOp(int32_t op) { 19861b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (U_FAILURE(*fStatus)) { 19871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return; 19881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 19891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fRXPat->fCompiledPat->addElement(op, *fStatus); 19901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) { 19911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_PATTERN_TOO_BIG); 19921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 19931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert} 1994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 19951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertvoid RegexCompile::appendOp(int32_t type, int32_t val) { 19961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(buildOp(type, val)); 19971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert} 1998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 1999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// insertOp() Insert a slot for a new opcode into the already 2003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// compiled pattern code. 
2004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Fill the slot with a NOP. Our caller will replace it 2006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// with what they really wanted. 2007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::insertOp(int32_t where) { 201050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UVector64 *code = fRXPat->fCompiledPat; 2011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(where>0 && where < code->size()); 2012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 20131b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t nop = buildOp(URX_NOP, 0); 2014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru code->insertElementAt(nop, where, *fStatus); 2015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2016b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Walk through the pattern, looking for any ops with targets that 2017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // were moved down by the insert. Fix them. 
2018b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loc; 2019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=0; loc<code->size(); loc++) { 202050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t op = (int32_t)code->elementAti(loc); 2021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType = URX_TYPE(op); 2022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opValue = URX_VAL(op); 2023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((opType == URX_JMP || 2024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_JMPX || 2025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_STATE_SAVE || 2026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_CTR_LOOP || 2027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_CTR_LOOP_NG || 2028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_JMP_SAV || 2029c53bf83a40a6888f5b246a73f13f6c919de1f5f9claireho opType == URX_JMP_SAV_X || 2030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType == URX_RELOC_OPRND) && opValue > where) { 2031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Target location for this opcode is after the insertion point and 2032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // needs to be incremented to adjust for the insertion. 
2033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opValue++; 20341b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert op = buildOp(opType, opValue); 2035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru code->setElementAt(op, loc); 2036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2038b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Now fix up the parentheses stack. All positive values in it are locations in 2040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) 2041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=0; loc<fParenStack.size(); loc++) { 2042b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t x = fParenStack.elementAti(loc); 2043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(x < code->size()); 2044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (x>where) { 2045b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru x++; 2046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fParenStack.setElementAt(x, loc); 2047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2048b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fMatchCloseParen > where) { 2051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchCloseParen++; 2052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fMatchOpenParen > where) { 2054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchOpenParen++; 2055b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
2056b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2057b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2058b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 20591b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert//------------------------------------------------------------------------------ 20601b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// 20611b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// allocateData() Allocate storage in the matcher's static data area. 20621b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// Return the index for the newly allocated data. 20631b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// The storage won't actually exist until we are running a match 20641b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// operation, but the storage indexes are inserted into various 20651b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// opcodes while compiling the pattern. 20661b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// 20671b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert//------------------------------------------------------------------------------ 20681b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertint32_t RegexCompile::allocateData(int32_t size) { 20691b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (U_FAILURE(*fStatus)) { 20701b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return 0; 20711b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 20721b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) { 20731b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_INTERNAL_ERROR); 20741b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return 0; 20751b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 20761b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t dataIndex = fRXPat->fDataSize; 
20771b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fRXPat->fDataSize += size; 20781b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (fRXPat->fDataSize >= 0x00fffff0) { 20791b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_INTERNAL_ERROR); 20801b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 20811b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return dataIndex; 20821b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert} 20831b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 20841b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 20851b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert//------------------------------------------------------------------------------ 20861b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// 20871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// allocateStackData() Allocate space in the back-tracking stack frame. 20881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// Return the index for the newly allocated data. 20891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// The frame indexes are inserted into various 20901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// opcodes while compiling the pattern, meaning that frame 20911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// size must be restricted to the size that will fit 20921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// as an operand (24 bits). 
20931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// 20941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert//------------------------------------------------------------------------------ 20951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertint32_t RegexCompile::allocateStackData(int32_t size) { 20961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (U_FAILURE(*fStatus)) { 20971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return 0; 20981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 20991b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) { 21001b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_INTERNAL_ERROR); 21011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return 0; 21021b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 21031b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t dataIndex = fRXPat->fFrameSize; 21041b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fRXPat->fFrameSize += size; 21051b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (fRXPat->fFrameSize >= 0x00fffff0) { 21061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_PATTERN_TOO_BIG); 21071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 21081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return dataIndex; 21091b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert} 21101b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 2111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// blockTopLoc() Find or create a location in the compiled pattern 2115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// at the 
start of the operation or block that has 2116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// just been compiled. Needed when a quantifier (* or 2117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// whatever) appears, and we need to add an operation 2118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// at the start of the thing being quantified. 2119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// (Parenthesized Blocks) have a slot with a NOP that 2121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// is reserved for this purpose. .* or similar don't 2122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// and a slot needs to be added. 2123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// parameter reserveLoc : TRUE - ensure that there is space to add an opcode 2125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// at the returned location. 2126c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// FALSE - just return the address, 2127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// do not reserve a location there. 2128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t RegexCompile::blockTopLoc(UBool reserveLoc) { 2131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t theLoc; 2132103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius fixLiterals(TRUE); // Emit code for any pending literals. 2133103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // If last item was a string, emit separate op for the its last char. 
2134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fRXPat->fCompiledPat->size() == fMatchCloseParen) 2135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The item just processed is a parenthesized block. 2137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru theLoc = fMatchOpenParen; // A slot is already reserved for us. 2138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(theLoc > 0); 2139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP); 2140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 2142103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Item just compiled is a single thing, a ".", or a single char, a string or a set reference. 2143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // No slot for STATE_SAVE was pre-reserved in the compiled code. 2144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We need to make space now. 2145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru theLoc = fRXPat->fCompiledPat->size()-1; 2146103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc); 2147103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) { 2148103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Strings take two opcode, we want the position of the first one. 2149103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // We can have a string at this point if a single character case-folded to two. 
2150103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius theLoc--; 2151103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 2152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (reserveLoc) { 21531b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t nop = buildOp(URX_NOP, 0); 2154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); 2155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return theLoc; 2158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2162b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// handleCloseParen When compiling a close paren, we need to go back 2165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// and fix up any JMP or SAVE operations within the 2166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// parenthesized block that need to target the end 2167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// of the block. The locations of these are kept on 2168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// the paretheses stack. 2169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// This function is called both when encountering a 2171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// real ) and at the end of the pattern. 
2172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::handleCloseParen() { 2175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patIdx; 2176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patOp; 2177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fParenStack.size() <= 0) { 2178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MISMATCHED_PAREN); 2179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 2180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2182103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Emit code for any pending literals. 2183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fixLiterals(FALSE); 2184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Fixup any operations within the just-closed parenthesized group 2186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // that need to reference the end of the (block). 2187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (The first one popped from the stack is an unused slot for 2188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // alternation (OR) state save, but applying the fixup to it does no harm.) 2189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 2190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patIdx = fParenStack.popi(); 2191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (patIdx < 0) { 2192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // value < 0 flags the start of the frame on the paren stack. 
2193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); 219650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx); 2197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set. 2198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru patOp |= fRXPat->fCompiledPat->size(); // Set it now. 2199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(patOp, patIdx); 2200b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchOpenParen = patIdx; 2201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // At the close of any parenthesized block, restore the match mode flags to 2204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the value they had at the open paren. Saved value is 2205c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // at the top of the paren stack. 
2206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fModeFlags = fParenStack.popi(); 2207b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(fModeFlags < 0); 2208c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // DO any additional fixups, depending on the specific kind of 2210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // parentesized grouping this is 2211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (patIdx) { 2213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case plain: 2214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case flags: 2215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // No additional fixups required. 2216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (Grouping-only parentheses) 2217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case capturing: 2219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Capturing Parentheses. 2220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert a End Capture op into the pattern. 2221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The frame offset of the variables for this cg is obtained from the 2222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // start capture op and put it into the end-capture op. 
2223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 222450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); 2226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t frameVarLocation = URX_VAL(captureOp); 22281b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_END_CAPTURE, frameVarLocation); 2229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case atomic: 2232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Atomic Parenthesis. 2233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert a LD_SP operation to restore the state stack to the position 2234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // it was when the atomic parens were entered. 
2235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 223650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); 2238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stoLoc = URX_VAL(stoOp); 22391b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LD_SP, stoLoc); 2240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case lookAhead: 2244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 224550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2247b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = URX_VAL(startOp); 22481b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LA_END, dataLoc); 2249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case negLookAhead: 2253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // See comment at doOpenLookAheadNeg 225550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); 2256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 
2257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = URX_VAL(startOp); 22581b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LA_END, dataLoc); 22591b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKTRACK, 0); 22601b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LA_END, dataLoc); 2261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Patch the URX_SAVE near the top of the block. 2263c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The destination of the SAVE is the final LA_END that was just added. 226450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen); 2265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); 2266c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t dest = fRXPat->fCompiledPat->size()-1; 22671b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert saveOp = buildOp(URX_STATE_SAVE, dest); 2268b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); 2269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case lookBehind: 2273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // See comment at doOpenLookBehind. 2275c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Append the URX_LB_END and URX_LA_END to the compiled pattern. 
227750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); 2278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = URX_VAL(startOp); 22801b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LB_END, dataLoc); 22811b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LA_END, dataLoc); 2282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Determine the min and max bounds for the length of the 2284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // string that the pattern can match. 2285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // An unbounded upper limit is an error. 2286b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2288b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 22891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (URX_TYPE(maxML) != 0) { 22901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_LOOK_BEHIND_LIMIT); 22911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 22921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 2293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (maxML == INT32_MAX) { 2294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_LOOK_BEHIND_LIMIT); 2295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(minML <= maxML); 
2298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert the min and max match len bounds into the URX_LB_CONT op that 2300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // appears at the top of the look-behind block, at location fMatchOpenParen+1 2301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); 2302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); 2303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2304b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2307b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case lookBehindN: 2310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // See comment at doOpenLookBehindNeg. 2312c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Append the URX_LBN_END to the compiled pattern. 
231450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dataLoc = URX_VAL(startOp); 23171b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_LBN_END, dataLoc); 2318b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2319b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Determine the min and max bounds for the length of the 2320b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // string that the pattern can match. 2321b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // An unbounded upper limit is an error. 2322b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2323b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2324b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 23251b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (URX_TYPE(maxML) != 0) { 23261b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert error(U_REGEX_LOOK_BEHIND_LIMIT); 23271b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 23281b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 2329b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (maxML == INT32_MAX) { 2330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_LOOK_BEHIND_LIMIT); 2331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2333b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(minML <= maxML); 2334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
2335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert the min and max match len bounds into the URX_LB_CONT op that 2336b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // appears at the top of the look-behind block, at location fMatchOpenParen+1 2337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); 2338b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); 2339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2340b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Insert the pattern location to continue at after a successful match 2341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // as the last operand of the URX_LBN_CONT 23421b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); 2343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); 2344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 2350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 2351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2352b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // remember the next location in the compiled pattern. 
2354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The compilation of Quantifiers will look at this to see whether its looping 2355b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // over a parenthesized block or a single item 2356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fMatchCloseParen = fRXPat->fCompiledPat->size(); 2357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2362b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// compileSet Compile the pattern operations for a reference to a 2364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// UnicodeSet. 2365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::compileSet(UnicodeSet *theSet) 2368b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2369b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (theSet == NULL) { 2370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 2371b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2372c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Remove any strings from the set. 2373c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // There shoudn't be any, but just in case. 
2374c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // (Case Closure can add them; if we had a simple case closure avaialble that 2375c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // ignored strings, that would be better.) 2376c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru theSet->removeAllStrings(); 2377b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t setSize = theSet->size(); 2378b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2379b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (setSize) { 2380c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case 0: 2381b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2382c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Set of no elements. Always fails to match. 23831b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_BACKTRACK, 0); 2384b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete theSet; 2385b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2387c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case 1: 2389b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2390b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The set contains only a single code point. Put it into 2391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the compiled pattern as a single char operation rather 2392b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // than a set, and discard the set itself. 
239350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho literalChar(theSet->charAt(0)); 2394b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru delete theSet; 2395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2397c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2398c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru default: 2399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The set contains two or more chars. (the normal case) 2401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Put it into the compiled pattern as a set. 2402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t setNumber = fRXPat->fSets->size(); 2403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fSets->addElement(theSet, *fStatus); 24041b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(URX_SETREF, setNumber); 2405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2406b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2410b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// compileInterval Generate the code for a {min, max} style interval quantifier. 
2413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Except for the specific opcodes used, the code is the same 2414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// for all three types (greedy, non-greedy, possessive) of 2415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// intervals. The opcodes are supplied as parameters. 241659d709d503bab6e2b61931737e662dd293b40578ccornelius// (There are two sets of opcodes - greedy & possessive use the 241759d709d503bab6e2b61931737e662dd293b40578ccornelius// same ones, while non-greedy has it's own.) 2418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2419b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The code for interval loops has this form: 2420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 0 CTR_INIT counter loc (in stack frame) 2421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 1 5 patt address of CTR_LOOP at bottom of block 2422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2 min count 2423b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3 max count (-1 for unbounded) 2424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4 ... 
block to be iterated over 2425c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 5 CTR_LOOP 2426c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 2427c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// In 2428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) 2430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru{ 2431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The CTR_INIT op at the top of the block with the {n,m} quantifier takes 2432b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // four slots in the compiled code. Reserve them. 2433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topOfBlock = blockTopLoc(TRUE); 2434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topOfBlock); 2435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topOfBlock); 2436b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topOfBlock); 2437b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2438b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The operands for the CTR_INIT opcode include the index in the matcher data 243959d709d503bab6e2b61931737e662dd293b40578ccornelius // of the counter. Allocate it now. There are two data items 244059d709d503bab6e2b61931737e662dd293b40578ccornelius // counterLoc --> Loop counter 244159d709d503bab6e2b61931737e662dd293b40578ccornelius // +1 --> Input index (for breaking non-progressing loops) 244259d709d503bab6e2b61931737e662dd293b40578ccornelius // (Only present if unbounded upper limit on loop) 24431b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t dataSize = fIntervalUpper < 0 ? 
2 : 1; 24441b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t counterLoc = allocateStackData(dataSize); 2445b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 24461b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t op = buildOp(InitOp, counterLoc); 2447b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topOfBlock); 2448b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2449b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The second operand of CTR_INIT is the location following the end of the loop. 2450b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the 2451b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // compilation of something later on causes the code to grow and the target 2452b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // position to move. 2453b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loopEnd = fRXPat->fCompiledPat->size(); 24541b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert op = buildOp(URX_RELOC_OPRND, loopEnd); 2455b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); 2456b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Followed by the min and max counts. 2458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); 2459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); 2460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. 
2462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Goes at end of the block being looped over, so just append to the code so far. 24631b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(LoopOp, topOfBlock); 2464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if ((fIntervalLow & 0xff000000) != 0 || 246627f654740f2a26ad62a5c155af9199af9e69b889claireho (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { 2467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_NUMBER_TOO_BIG); 2468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { 2471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_MAX_LT_MIN); 2472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUBool RegexCompile::compileInlineInterval() { 2478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { 2479b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Too big to inline. Fail, which will cause looping code to be generated. 2480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (Upper < Lower picks up unbounded upper and errors, both.) 
2481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t topOfBlock = blockTopLoc(FALSE); 2485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalUpper == 0) { 2486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Pathological case. Attempt no matches, as if the block doesn't exist. 24871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Discard the generated code for the block. 24881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // If the block included parens, discard the info pertaining to them as well. 2489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setSize(topOfBlock); 24901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (fMatchOpenParen >= topOfBlock) { 24911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fMatchOpenParen = -1; 24921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 24931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (fMatchCloseParen >= topOfBlock) { 24941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fMatchCloseParen = -1; 24951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 2496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { 2500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The thing being repeated is not a single op, but some 2501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // more complex block. Do it as a loop, not inlines. 
2502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note that things "repeated" a max of once are handled as inline, because 2503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the one copy of the code already generated is just fine. 2504b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return FALSE; 2505b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2506b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Pick up the opcode that is to be repeated 2508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 250950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); 2510b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2511c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Compute the pattern location where the inline sequence 2512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // will end, and set up the state save op that will be needed. 
2513c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 2514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 2515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru + fIntervalUpper + (fIntervalUpper-fIntervalLow); 25161b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc); 2517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fIntervalLow == 0) { 2518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru insertOp(topOfBlock); 2519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); 2520b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop, emitting the op for the thing being repeated each time. 2525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop starts at 1 because one instance of the op already exists in the pattern, 2526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // it was put there when it was originally encountered. 
2527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t i; 2528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (i=1; i<fIntervalUpper; i++ ) { 25291b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (i >= fIntervalLow) { 25301b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(saveOp); 2531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 25321b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert appendOp(op); 2533b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return TRUE; 2535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 2536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2538b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2539b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2540b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2541f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// caseInsensitiveStart given a single code point from a pattern string, determine the 2542f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// set of characters that could potentially begin a case-insensitive 2543f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// match of a string beginning with that character, using full Unicode 2544f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// case insensitive matching. 2545f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// 2546f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// This is used in optimizing find(). 
//
//                           closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
//                           misses cases like this:
//                              A string from the pattern begins with 'ss' (although all we know
//                                  in this context is that it begins with 's')
//                              The pattern could match a string beginning with a German sharp-s
//
//                           To the ordinary case closure for a character c, we add all other
//                           characters cx where the case closure of cx includes a string form that begins
//                           with the original character c.
//
//                           This function could be made smarter. The full pattern string is available
//                           and it would be possible to verify that the extra characters being added
//                           to the starting set fully match, rather than having just a first-char of the
//                           folded form match.
2562f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// 2563f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius//------------------------------------------------------------------------------ 2564f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusvoid RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) { 2565f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2566f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// Machine Generated below. 2567f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// It may need updating with new versions of Unicode. 2568f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed. 2569f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing 2570f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2571f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// Machine Generated Data. Do not hand edit. 
2572f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius static const UChar32 RECaseFixCodePoints[] = { 2573f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 2574f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 2575f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 2576f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 2577f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000}; 2578f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2579f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius static const int16_t RECaseFixStringOffsets[] = { 2580f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 2581f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 2582f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 2583f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 2584f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0}; 2585f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2586f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius static const int16_t RECaseFixCounts[] = { 2587f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 2588f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 2589f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 2590f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 
0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 2591f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0}; 2592f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2593f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius static const UChar RECaseFixData[] = { 2594f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 2595f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 2596f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 2597f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 2598f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14, 2599f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, 2600f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 2601f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 2602f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 2603f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 2604f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0}; 2605f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2606f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// End of machine generated data. 
2607f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 260864339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) { 260964339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert // This function should never be called with an invalid input character. 261064339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert U_ASSERT(FALSE); 261164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert starterChars->clear(); 261264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2613f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT); 2614f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius starterChars->set(caseFoldedC, caseFoldedC); 2615f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2616f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t i; 2617f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (i=0; RECaseFixCodePoints[i]<c ; i++) { 2618f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // Simple linear search through the sorted list of interesting code points. 
2619f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 2620f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2621f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if (RECaseFixCodePoints[i] == c) { 2622f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t dataIndex = RECaseFixStringOffsets[i]; 2623f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t numCharsToAdd = RECaseFixCounts[i]; 2624f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UChar32 cpToAdd = 0; 2625f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius for (int32_t j=0; j<numCharsToAdd; j++) { 2626f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd); 2627f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius starterChars->add(cpToAdd); 2628f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 2629f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 2630f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2631f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius starterChars->closeOver(USET_CASE_INSENSITIVE); 2632f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius starterChars->removeAllStrings(); 2633f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } else { 2634f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // Not a cased character. Just return it alone. 
2635f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius starterChars->set(c, c); 2636f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 2637f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 2638f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2639f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2640f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2641f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 2642f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius//------------------------------------------------------------------------------ 2643f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius// 2644b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// matchStartType Determine how a match can start. 2645b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Used to optimize find() operations. 2646b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2647b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Operation is very similar to minMatchLength(). Walk the compiled 2648b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// pattern, keeping an on-going minimum-match-length. For any 2649b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// op where the min match coming in is zero, add that ops possible 2650b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// starting matches to the possible starts for the overall pattern. 
2651b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 2652b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 2653b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::matchStartType() { 2654b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 2655b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 2656b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2657b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2658b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2659b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loc; // Location in the pattern of the current op being processed. 2660b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; // The op being processed 2661b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType; // The opcode type of the op 2662b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern 2663b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t numInitialStrings = 0; // Number of strings encountered that could match at start. 2664b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2665b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UBool atStart = TRUE; // True if no part of the pattern yet encountered 2666b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // could have advanced the position in a match. 
2667b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (Maximum match length so far == 0) 2668b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2669b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forwardedLength is a vector holding minimum-match-length values that 2670b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2671b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // It must be one longer than the pattern being checked because some ops 2672b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // will jmp to a end-of-block+1 location from within a block, and we must 2673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // count those when checking the block. 2674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end = fRXPat->fCompiledPat->size(); 2675b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 forwardedLength(end+1, *fStatus); 2676b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setSize(end+1); 2677b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=3; loc<end; loc++) { 2678b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(INT32_MAX, loc); 2679b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc = 3; loc<end; loc++) { 268250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType = URX_TYPE(op); 2684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The loop is advancing linearly through the pattern. 
2686b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the op we are now at was the destination of a branch in the pattern, 2687b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and that path has a shorter minimum length than the current accumulated value, 2688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // replace the current accumulated value. 2689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(loc) < currentLen) { 2690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc); 2691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (opType) { 2695b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that don't change the total length matched 2696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RESERVED_OP: 2697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END: 2698c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_FAIL: 2699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_LEN: 2700b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_NOP: 2701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_START_CAPTURE: 2702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END_CAPTURE: 2703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_B: 2704b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_BU: 2705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_G: 2706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_Z: 
2707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOLLAR: 2708c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_M: 2709c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_D: 2710c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_MD: 2711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RELOC_OPRND: 2712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_INP_LOC: 2713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF_I: 2715fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 2716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LD_SP: 2718b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2719c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET: 2721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (atStart) { 2722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_START; 2723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET_M: 2727c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_CARET_M_UNIX: 2728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (atStart) { 2729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_LINE; 
2730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2732c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2733b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR: 2734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // This character could appear at the start of a match. 2736b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Add it to the set of possible starting characters. 2737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->add(URX_VAL(op)); 2738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2744c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2745c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_SETREF: 2746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2747b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t sn = URX_VAL(op); 2748b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2749b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2750b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(*s); 2751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
2753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_SR_I: 2758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // [Set]*, like a SETREF, above, in what it can match, 2759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // but may not match at all, so currentLen is not incremented. 2760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t sn = URX_VAL(op); 2762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(*s); 2765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2767b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_DOT_I: 2771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2772b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // .* at the start of a pattern. 2773b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Any character can begin the match. 
2774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->clear(); 2775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->complement(); 2776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2782c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_STATIC_SETREF: 2783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t sn = URX_VAL(op); 2785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(sn>0 && sn<URX_LAST_SET); 2786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(*s); 2788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2796c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_STAT_SETREF_N: 2797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 
2798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t sn = URX_VAL(op); 2799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet sc(*s); 2801b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru sc.complement(); 2802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(sc); 2803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2807b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2808b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2809b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2810b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2811b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_D: 2812b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Digit Char 2813b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2814c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet s; 2815b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 2816b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_VAL(op) != 0) { 2817b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru s.complement(); 2818b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2819b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(s); 2820b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2821b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
2822b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 28271b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_H: 28281b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Horiz white space 28291b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (currentLen == 0) { 28301b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert UnicodeSet s; 28311b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus); 28321b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert s.add((UChar32)9); // Tab 28331b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (URX_VAL(op) != 0) { 28341b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert s.complement(); 28351b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 28361b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fRXPat->fInitialChars->addAll(s); 28371b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert numInitialStrings += 2; 28381b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 28391b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert currentLen++; 28401b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert atStart = FALSE; 28411b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 28421b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 28431b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 28441b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_R: // Any line ending sequence 28451b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_V: // Any line ending code point, with optional negation 28461b7d32f919554dda9c193b32188251337bc756f1Fredrik 
Roubert if (currentLen == 0) { 28471b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert UnicodeSet s; 28481b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert s.add((UChar32)0x0a, (UChar32)0x0d); // add range 28491b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert s.add((UChar32)0x85); 28501b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert s.add((UChar32)0x2028, (UChar32)0x2029); 28511b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (URX_VAL(op) != 0) { 28521b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Complement option applies to URX_BACKSLASH_V only. 28531b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert s.complement(); 28541b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 28551b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert fRXPat->fInitialChars->addAll(s); 28561b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert numInitialStrings += 2; 28571b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 28581b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert currentLen++; 28591b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert atStart = FALSE; 28601b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 28611b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 28621b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 28631b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 2864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR_I: 2865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Case Insensitive Single Character. 
2866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2867b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = URX_VAL(op); 2868b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2869f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeSet starters(c, c); 2870f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius starters.closeOver(USET_CASE_INSENSITIVE); 2871f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // findCaseInsensitiveStarters(c, &starters); 2872f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // For ONECHAR_I, no need to worry about text chars that expand on folding into strings. 2873f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // The expanded folding can't match the pattern. 2874f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius fRXPat->fInitialChars->addAll(starters); 2875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 2876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Char has no case variants. Just add it as-is to the 2877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // set of possible starting chars. 2878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->add(c); 2879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_X: // Grahpeme Cluster. 
Minimum is 1, max unbounded. 2888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY_ALL: // . matches one or two. 2889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY: 2890c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOTANY_UNIX: 2891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // These constructs are all bad news when they appear at the start 2893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of a match. Any character can begin the match. 2894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->clear(); 2895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->complement(); 2896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; 2897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 2899b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2900b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2901b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2902b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2903b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMPX: 2904b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; // Except for extra operand on URX_JMPX, same as URX_JMP. 
29058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert U_FALLTHROUGH; 2906b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP: 2907b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2908b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 2909b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest < loc) { 2910b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop of some kind. Can safely ignore, the worst that will happen 2911b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is that we understate the true minimum length 2912b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc+1); 2913c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2914b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 2915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Forward jump. Propagate the current min length to the target loc of the jump. 
2916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(jmpDest <= end+1); 2917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(jmpDest) > currentLen) { 2918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 2919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2920b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2921b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV: 2926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV_X: 2927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Combo of state save to the next loc, + jmp backwards. 2928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Net effect on min. length computation is nothing. 2929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2932c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_BACKTRACK: 2933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Fails are kind of like a branch, except that the min length was 2934b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // propagated already, by the state save. 
2935b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc+1); 2936b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2938b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2939b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2940b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATE_SAVE: 2941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2942b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // State Save, for forward jumps, propagate the current minimum. 2943b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the state save. 2944b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 2945b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest > loc) { 2946b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen < forwardedLength.elementAti(jmpDest)) { 2947b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 2948b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2949c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 2950b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2953c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING: 2958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 
296050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringLen = URX_VAL(stringLenOp); 2962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(stringLenOp >= 2); 2964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Add the starting character of this string to the set of possible starting 2966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // characters for this pattern. 2967b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringStartIdx = URX_VAL(op); 2968b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2969b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->add(c); 2970b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2971b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Remember this string. After the entire pattern has been checked, 2972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // if nothing else is identified that can start a match, we'll use it. 
2973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings++; 2974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialStringIdx = stringStartIdx; 2975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialStringLen = stringLen; 2976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2977c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 2978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen += stringLen; 2979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 2980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 2981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 2982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 2983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_I: 2984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 2985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Case-insensitive string. Unlike exact-match strings, we won't 2986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // attempt a string search for possible match positions. But we 2987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // do update the set of possible starting characters. 
2988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 298950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringLen = URX_VAL(stringLenOp); 2991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(stringLenOp >= 2); 2993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == 0) { 2994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Add the starting character of this string to the set of possible starting 2995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // characters for this pattern. 2996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t stringStartIdx = URX_VAL(op); 2997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2998f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeSet s; 2999f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius findCaseInsensitiveStarters(c, &s); 3000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars->addAll(s); 3001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru numInitialStrings += 2; // Matching on an initial string not possible. 
3002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen += stringLen; 3004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 3005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT: 3009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT_NG: 3010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop Init Ops. These don't change the min length, but they are 4 word ops 3012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // so location must be updated accordingly. 3013c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Loop Init Ops. 3014b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the min loop count == 0 3015b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // move loc forwards to the end of the loop, skipping over the body. 3016c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // If the min count is > 0, 3017b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // continue normal processing of the body of the loop. 
301850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 3019b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopEndLoc = URX_VAL(loopEndLoc); 302050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 3021b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (minLoopCount == 0) { 3022b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Min Loop Count of 0, treat like a forward branch and 3023b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // move the current minimum length up to the target 3024b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // (end of loop) location. 3025b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(loopEndLoc <= end+1); 3026b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(loopEndLoc) > currentLen) { 3027b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, loopEndLoc); 3028b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3029c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3030b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc+=3; // Skips over operands of CTR_INIT 3031b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3032b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 3033b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3034b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3035b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3036b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP: 3037b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP_NG: 3038c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Loop ops. 
3039b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The jump is conditional, backwards only. 3040b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 3041b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3042c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3043b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_C: 3044b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // More loop ops. These state-save to themselves. 3045b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // don't change the minimum match 3046b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru atStart = FALSE; 3047b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3048c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3049b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3050b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_START: 3051b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_START: 3052b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3053b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look-around. Scan forward until the matching look-ahead end, 3054b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // without processing the look-around block. This is overly pessimistic. 3055fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 3056c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Keep track of the nesting depth of look-around blocks. Boilerplate code for 3057c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // lookahead contains two LA_END instructions, so count goes up by two 3058c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // for each LA_START. 3059c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t depth = (opType == URX_LA_START? 
2: 1); 3060b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3061b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 306250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3063c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_START) { 3064c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru depth+=2; 3065c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3066c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op) == URX_LB_START) { 3067b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru depth++; 3068b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3069b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 3070c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru depth--; 3071b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (depth == 0) { 3072b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3073b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3074b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3075b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_STATE_SAVE) { 3076b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Need this because neg lookahead blocks will FAIL to outside 3077b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the block. 
3078b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 3079b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest > loc) { 3080b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen < forwardedLength.elementAti(jmpDest)) { 3081b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3082b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3083b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3084b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3085c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(loc <= end); 3086b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3087b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3088b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3089c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3090b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_END: 3091b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_CONT: 3092b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_END: 3093b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_CONT: 3094b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_END: 3095c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(FALSE); // Shouldn't get here. 
These ops should be 3096b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // consumed by the scan in URX_LA_START and LB_START 3097b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3098b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3099c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3100b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 3101b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 3102b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3103c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3104b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3105b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3106b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3107b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We have finished walking through the ops. Check whether some forward jump 3108b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // propagated a shorter length to location end+1. 3109b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(end+1) < currentLen) { 3110b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(end+1); 3111b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3112b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3113b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3114b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChars8->init(fRXPat->fInitialChars); 3115b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3116b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3117b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Sort out what we should check for when looking for candidate match start positions. 
3118b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // In order of preference, 3119b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 1. Start of input text buffer. 3120b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 2. A literal string. 3121b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3. Start of line in multi-line mode. 3122b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 4. A single literal character. 3123b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 5. A character from a set of characters. 3124b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3125b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fRXPat->fStartType == START_START) { 3126b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Match only at the start of an input text string. 3127b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // start type is already set. We're done. 3128b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { 3129b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Match beginning only with a literal string. 3130b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); 3131b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(fRXPat->fInitialChars->contains(c)); 3132b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_STRING; 3133b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChar = c; 3134b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (fRXPat->fStartType == START_LINE) { 3135b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Match at start of line in Multi-Line mode. 3136b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Nothing to do here; everything is already set. 
3137b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (fRXPat->fMinMatchLen == 0) { 3138b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Zero length match possible. We could start anywhere. 3139b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_NO_INFO; 3140b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (fRXPat->fInitialChars->size() == 1) { 3141b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // All matches begin with the same char. 3142b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_CHAR; 3143b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); 3144b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); 3145b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE && 3146b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fMinMatchLen > 0) { 3147b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Matches start with a set of character smaller than the set of all chars. 
3148b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_SET; 3149b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 3150b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Matches can start with anything 3151b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fStartType = START_NO_INFO; 3152b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3153b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3154b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3155b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3156b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3157b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3158b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3159b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3160b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3161b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// minMatchLength Calculate the length of the shortest string that could 3162c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// match the specified pattern. 3163b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Length is in 16 bit code units, not code points. 3164b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3165b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The calculated length may not be exact. The returned 3166b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// value may be shorter than the actual minimum; it must 3167b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// never be longer. 
3168b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3169b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// start and end are the range of p-code operations to be 3170b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// examined. The endpoints are included in the range. 3171b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3172b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3173b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { 3174b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 3175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 3176b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(start <= end); 3179b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(end < fRXPat->fCompiledPat->size()); 3180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3181b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loc; 3183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; 3184b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType; 3185b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t currentLen = 0; 3186b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3187b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // forwardedLength is a vector holding minimum-match-length values that 3189b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are propagated forward in the pattern by JMP or STATE_SAVE operations. 
3190b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // It must be one longer than the pattern being checked because some ops 3191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // will jmp to a end-of-block+1 location from within a block, and we must 3192b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // count those when checking the block. 3193b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 forwardedLength(end+2, *fStatus); 3194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setSize(end+2); 3195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=start; loc<=end+1; loc++) { 3196b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(INT32_MAX, loc); 3197b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3198b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3199b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc = start; loc<=end; loc++) { 320050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType = URX_TYPE(op); 3202b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3203b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The loop is advancing linearly through the pattern. 3204b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the op we are now at was the destination of a branch in the pattern, 3205b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and that path has a shorter minimum length than the current accumulated value, 3206b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // replace the current accumulated value. 
3207c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some 3208c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // no-match-possible cases. 3209b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(loc) < currentLen) { 3210b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc); 3211b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 3212b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3214b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (opType) { 3215b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that don't change the total length matched 3216b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RESERVED_OP: 3217b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END: 3218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_LEN: 3219b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_NOP: 3220b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_START_CAPTURE: 3221b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END_CAPTURE: 3222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_B: 3223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_BU: 3224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_G: 3225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_Z: 3226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET: 3227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOLLAR: 3228c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case 
URX_DOLLAR_M: 3229c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_D: 3230c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_MD: 3231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RELOC_OPRND: 3232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_INP_LOC: 3233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET_M: 3234c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_CARET_M_UNIX: 3235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 3236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF_I: 3237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 3239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LD_SP: 3240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3241b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV: 3242b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV_X: 3243b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3244c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3245b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3246b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that match a minimum of one character (one or two 16 bit code units.) 
3247c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 3248b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR: 3249b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATIC_SETREF: 3250b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STAT_SETREF_N: 3251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_SETREF: 3252b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_D: 32531b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_H: 32541b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_R: 32551b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_V: 3256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR_I: 3257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 3258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY_ALL: // . matches one or two. 3259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY: 3260c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOTANY_UNIX: 3261b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen++; 3262b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3263b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3264b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3265b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMPX: 3266b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; // URX_JMPX has an extra operand, ignored here, 3267b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // otherwise processed identically to URX_JMP. 
32688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert U_FALLTHROUGH; 3269b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP: 3270b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3271b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 3272b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest < loc) { 3273b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop of some kind. Can safely ignore, the worst that will happen 3274b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // is that we understate the true minimum length 3275b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc+1); 3276b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 3277b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Forward jump. Propagate the current min length to the target loc of the jump. 3278b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(jmpDest <= end+1); 3279b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(jmpDest) > currentLen) { 3280b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3281b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3282b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3283b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3284b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3285b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3286c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_BACKTRACK: 3287b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3288c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Back-tracks are kind of like a branch, except that the min length was 3289b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // propagated 
already, by the state save. 3290b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc+1); 3291b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3294b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3295b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATE_SAVE: 3296b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3297b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // State Save, for forward jumps, propagate the current minimum. 3298b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the state save. 3299b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 3300b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest > loc) { 3301b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen < forwardedLength.elementAti(jmpDest)) { 3302b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3303b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3304c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3305b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3306b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3307c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3308b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3309b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING: 3310b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3311b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 331250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3313b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen 
+= URX_VAL(stringLenOp); 3314b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3315b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3316b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3317b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3318103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius case URX_STRING_I: 3319103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius { 3320103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius loc++; 3321103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // TODO: with full case folding, matching input text may be shorter than 3322103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // the string we have here. More smarts could put some bounds on it. 3323103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Assume a min length of one for now. A min length of zero causes 3324103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // optimization failures for a pattern like "string"+ 3325103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // currentLen += URX_VAL(stringLenOp); 3326103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius currentLen += 1; 3327103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 3328103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius break; 3329103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 3330b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT: 3331b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT_NG: 3332b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3333c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Loop Init Ops. 3334b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the min loop count == 0 3335b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // move loc forwards to the end of the loop, skipping over the body. 
3336c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // If the min count is > 0, 3337b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // continue normal processing of the body of the loop. 333850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 3339b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loopEndLoc = URX_VAL(loopEndLoc); 334050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 3341b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (minLoopCount == 0) { 3342b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc = loopEndLoc; 3343b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 3344b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc+=3; // Skips over operands of CTR_INIT 3345b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3346b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3347b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3348b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3349b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3350b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP: 3351b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP_NG: 3352c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Loop ops. 3353b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The jump is conditional, backwards only. 
3354b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3355c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3356b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_SR_I: 3357b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_DOT_I: 3358b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_C: 3359b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // More loop ops. These state-save to themselves. 3360b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // don't change the minimum match - could match nothing at all. 3361b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3362c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3363b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3364b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_START: 3365b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_START: 3366b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3367b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look-around. Scan forward until the matching look-ahead end, 3368c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // without processing the look-around block. This is overly pessimistic for look-ahead, 3369c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // it assumes that the look-ahead match might be zero-length. 3370b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // TODO: Positive lookahead could recursively do the block, then continue 3371c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // with the longer of the block or the value coming in. Ticket 6060 3372c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t depth = (opType == URX_LA_START? 
2: 1);; 3373b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3374b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 337550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3376c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_START) { 3377c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The boilerplate for look-ahead includes two LA_END insturctions, 3378c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Depth will be decremented by each one when it is seen. 3379c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru depth += 2; 3380c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3381c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op) == URX_LB_START) { 3382b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru depth++; 3383b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3384c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_END) { 3385c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru depth--; 3386b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (depth == 0) { 3387b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3388b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3389c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3390c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (URX_TYPE(op)==URX_LBN_END) { 3391b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru depth--; 3392c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (depth == 0) { 3393c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 3394c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 3395b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3396b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == 
URX_STATE_SAVE) { 3397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Need this because neg lookahead blocks will FAIL to outside 3398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the block. 3399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 3400b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest > loc) { 3401b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen < forwardedLength.elementAti(jmpDest)) { 3402b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3403b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3404b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3405b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3406c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(loc <= end); 3407b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3408b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3409b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3410c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3411b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_END: 3412b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_CONT: 3413b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_END: 3414b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_CONT: 3415b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_END: 3416b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Only come here if the matching URX_LA_START or URX_LB_START was not in the 3417b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // range being sized, which happens when measuring size of look-behind blocks. 
3418b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3419c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3420b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 3421b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 3422b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3423c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3424b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3425b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3426b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // We have finished walking through the ops. Check whether some forward jump 3427b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // propagated a shorter length to location end+1. 3428b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(end+1) < currentLen) { 3429b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(end+1); 3430b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 3431b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3432c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3433b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return currentLen; 3434b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3435b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3436103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// Increment with overflow check. 3437103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius// val and delta will both be positive. 
static int32_t safeIncrement(int32_t val, int32_t delta) {
    // Distance from val up to the INT32_MAX ceiling. Measuring the remaining
    // room first (instead of adding and then testing) means the sum is only
    // ever formed when it is known to fit.
    const int32_t headroom = INT32_MAX - val;

    // Saturate: a delta that would reach or pass the ceiling yields INT32_MAX.
    return (delta < headroom) ? (val + delta) : INT32_MAX;
}


//------------------------------------------------------------------------------
//
//   maxMatchLength    Calculate the length of the longest string that could
//                     match the specified pattern.
//                     Length is in 16 bit code units, not code points.
//
//                     The calculated length may not be exact.  The returned
//                     value may be longer than the actual maximum; it must
//                     never be shorter.
3457b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3458b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3459b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruint32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { 3460b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 3461b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return 0; 3462b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3463b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(start <= end); 3464b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(end < fRXPat->fCompiledPat->size()); 3465b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3466b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3467b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loc; 3468b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t op; 3469b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType; 3470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t currentLen = 0; 3471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 forwardedLength(end+1, *fStatus); 3472b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setSize(end+1); 3473b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3474b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=start; loc<=end; loc++) { 3475b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(0, loc); 3476b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3477b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3478b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc = start; loc<=end; loc++) { 347950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = 
(int32_t)fRXPat->fCompiledPat->elementAti(loc); 3480b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru opType = URX_TYPE(op); 3481b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3482b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The loop is advancing linearly through the pattern. 3483b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // If the op we are now at was the destination of a branch in the pattern, 3484b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // and that path has a longer maximum length than the current accumulated value, 3485b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // replace the current accumulated value. 3486b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(loc) > currentLen) { 3487b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc); 3488b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3489b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3490b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (opType) { 3491b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that don't change the total length matched 3492b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RESERVED_OP: 3493b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END: 3494b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_LEN: 3495b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_NOP: 3496b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_START_CAPTURE: 3497b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END_CAPTURE: 3498b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_B: 3499b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_BU: 3500b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 
case URX_BACKSLASH_G: 3501b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_Z: 3502b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET: 3503b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOLLAR: 3504c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_M: 3505c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_D: 3506c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_MD: 3507b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RELOC_OPRND: 3508b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_INP_LOC: 3509b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET_M: 3510c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_CARET_M_UNIX: 3511b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3512b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 
3513b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LD_SP: 3514b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3515b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_END: 3516b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_CONT: 3517b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_CONT: 3518b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_END: 3519b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3520c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3521b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3522b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that increase that cause an unbounded increase in the length 3523b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of a matched string, or that increase it a hard to characterize way. 3524b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Call the max length unbounded, and stop further checking. 3525b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 3526b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKREF_I: 3527b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 3528b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = INT32_MAX; 3529b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3530b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3531b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3532b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Ops that match a max of one character (possibly two 16 bit code units.) 
3533c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 3534b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATIC_SETREF: 3535b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STAT_SETREF_N: 3536b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_SETREF: 3537b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_D: 35381b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_H: 35391b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_R: 35401b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_V: 3541b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR_I: 3542c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOTANY_ALL: 3543b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY: 3544c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOTANY_UNIX: 3545103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius currentLen = safeIncrement(currentLen, 2); 3546b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3547b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3548b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Single literal character. Increase current max length by one or two, 3549b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // depending on whether the char is in the supplementary range. 
3550b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR: 3551103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius currentLen = safeIncrement(currentLen, 1); 3552b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_VAL(op) > 0x10000) { 3553103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius currentLen = safeIncrement(currentLen, 1); 3554b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3555b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3556b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3557c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Jumps. 3558b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // 3559b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP: 3560b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMPX: 3561b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV: 3562b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV_X: 3563b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3564b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 3565b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest < loc) { 3566b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Loop of some kind. Max match length is unbounded. 3567b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = INT32_MAX; 3568b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 3569b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Forward jump. Propagate the current min length to the target loc of the jump. 
3570b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (forwardedLength.elementAti(jmpDest) < currentLen) { 3571b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3572b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = 0; 3574b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3575b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3576b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3578c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_BACKTRACK: 3579c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // back-tracks are kind of like a branch, except that the max length was 3580b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // propagated already, by the state save. 3581b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = forwardedLength.elementAti(loc+1); 3582b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3583b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3584b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3585b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATE_SAVE: 3586b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3587b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // State Save, for forward jumps, propagate the current minimum. 3588b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // of the state save. 3589b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // For backwards jumps, they create a loop, maximum 3590b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // match length is unbounded. 
3591b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t jmpDest = URX_VAL(op); 3592b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (jmpDest > loc) { 3593b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen > forwardedLength.elementAti(jmpDest)) { 3594b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru forwardedLength.setElementAt(currentLen, jmpDest); 3595b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3596b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } else { 3597b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = INT32_MAX; 3598b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3599b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3600b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3601c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3602b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3603b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3604b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3605b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING: 3606103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius { 3607103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius loc++; 3608103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3609103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 3610103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius break; 3611103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius } 3612103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius 3613b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_I: 3614103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // TODO: This code assumes that any user string that matches will be no longer 
3615103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // than our compiled string, with case insensitive matching. 3616103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Our compiled string has been case-folded already. 3617103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // 3618103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // Any matching user string will have no more code points than our 3619103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // compiled (folded) string. Folding may add code points, but 3620103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // not remove them. 3621103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // 3622fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // There is a potential problem if a supplemental code point 3623103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // case-folds to a BMP code point. In this case our compiled string 3624103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // could be shorter (in code units) than a matching user string. 3625103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // 3626103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // At this time (Unicode 6.1) there are no such characters, and this case 3627103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // is not being handled. A test, intltest regex/Bug9283, will fail if 3628103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // any problematic characters are added to Unicode. 3629103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // 3630103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // If this happens, we can make a set of the BMP chars that the 3631103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // troublesome supplementals fold to, scan our string, and bump the 3632103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // currentLen one extra for each that is found. 
3633103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius // 3634b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3635b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 363650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3637103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 3638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT: 3642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT_NG: 364359d709d503bab6e2b61931737e662dd293b40578ccornelius // For Loops, recursively call this function on the pattern for the loop body, 364459d709d503bab6e2b61931737e662dd293b40578ccornelius // then multiply the result by the maximum loop count. 364559d709d503bab6e2b61931737e662dd293b40578ccornelius { 364659d709d503bab6e2b61931737e662dd293b40578ccornelius int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1)); 364759d709d503bab6e2b61931737e662dd293b40578ccornelius if (loopEndLoc == loc+4) { 364859d709d503bab6e2b61931737e662dd293b40578ccornelius // Loop has an empty body. No affect on max match length. 364959d709d503bab6e2b61931737e662dd293b40578ccornelius // Continue processing with code after the loop end. 
365059d709d503bab6e2b61931737e662dd293b40578ccornelius loc = loopEndLoc; 365159d709d503bab6e2b61931737e662dd293b40578ccornelius break; 365259d709d503bab6e2b61931737e662dd293b40578ccornelius } 3653fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 36541b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3)); 365559d709d503bab6e2b61931737e662dd293b40578ccornelius if (maxLoopCount == -1) { 365659d709d503bab6e2b61931737e662dd293b40578ccornelius // Unbounded Loop. No upper bound on match length. 365759d709d503bab6e2b61931737e662dd293b40578ccornelius currentLen = INT32_MAX; 365859d709d503bab6e2b61931737e662dd293b40578ccornelius break; 365959d709d503bab6e2b61931737e662dd293b40578ccornelius } 366059d709d503bab6e2b61931737e662dd293b40578ccornelius 366159d709d503bab6e2b61931737e662dd293b40578ccornelius U_ASSERT(loopEndLoc >= loc+4); 36621b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call. 
36631b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount; 36641b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if (updatedLen >= INT32_MAX) { 36651b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert currentLen = INT32_MAX; 366659d709d503bab6e2b61931737e662dd293b40578ccornelius break; 366759d709d503bab6e2b61931737e662dd293b40578ccornelius } 36681b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert currentLen = (int32_t)updatedLen; 366959d709d503bab6e2b61931737e662dd293b40578ccornelius loc = loopEndLoc; 367059d709d503bab6e2b61931737e662dd293b40578ccornelius break; 367159d709d503bab6e2b61931737e662dd293b40578ccornelius } 367259d709d503bab6e2b61931737e662dd293b40578ccornelius 3673b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP: 3674b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP_NG: 367559d709d503bab6e2b61931737e662dd293b40578ccornelius // These opcodes will be skipped over by code for URX_CRT_INIT. 367659d709d503bab6e2b61931737e662dd293b40578ccornelius // We shouldn't encounter them here. 367759d709d503bab6e2b61931737e662dd293b40578ccornelius U_ASSERT(FALSE); 367859d709d503bab6e2b61931737e662dd293b40578ccornelius break; 367959d709d503bab6e2b61931737e662dd293b40578ccornelius 3680b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_SR_I: 3681b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_DOT_I: 3682b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_C: 3683b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // For anything to do with loops, make the match length unbounded. 
3684b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru currentLen = INT32_MAX; 3685b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3686c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3687c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3688b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3689b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_START: 3690b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_END: 3691b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look-ahead. Just ignore, treat the look-ahead block as if 3692b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // it were normal pattern. Gives a too-long match length, 3693b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // but good enough for now. 3694b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3695c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3696b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // End of look-ahead ops should always be consumed by the processing at 3697b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // the URX_LA_START op. 3698b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // U_ASSERT(FALSE); 3699b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // break; 3700c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3701b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_START: 3702b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3703b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Look-behind. Scan forward until the matching look-around end, 3704c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // without processing the look-behind block. 
3705b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t depth = 0; 3706b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 3707b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru loc++; 370850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3709b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) { 3710b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru depth++; 3711b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3712b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 3713b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (depth == 0) { 3714b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3715b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3716b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru depth--; 3717b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3718c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(loc < end); 3719b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3720b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3721b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3722b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3723b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 3724b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 3725b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3726b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3727c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3728b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (currentLen == INT32_MAX) { 3729b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // The maximum length is unbounded. 
3730b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Stop further processing of the pattern. 3731b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3732b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3733c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3734b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3735b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return currentLen; 3736c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 3737b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3738b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3739b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3740b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3741b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3742b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// stripNOPs Remove any NOP operations from the compiled pattern code. 3743b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Extra NOPs are inserted for some constructs during the initial 3744b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// code generation to provide locations that may be patched later. 3745b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Many end up unneeded, and are removed by this function. 3746b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 374750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// In order to minimize the number of passes through the pattern, 374850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// back-reference fixup is also performed here (adjusting 374950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// back-reference operands to point to the correct frame offsets). 
375050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// 3751b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3752b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::stripNOPs() { 3753b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3754b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 3755b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return; 3756b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3757b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3758b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t end = fRXPat->fCompiledPat->size(); 3759b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UVector32 deltas(end, *fStatus); 3760b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3761b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Make a first pass over the code, computing the amount that things 3762b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // will be offset at each location in the original code. 
3763b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t loc; 3764b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t d = 0; 3765b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (loc=0; loc<end; loc++) { 3766b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru deltas.addElement(d, *fStatus); 376750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3768b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (URX_TYPE(op) == URX_NOP) { 3769b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru d++; 3770b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3771b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3772fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 377350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString caseStringBuffer; 3774b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3775b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Make a second pass over the code, removing the NOPs by moving following 3776b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // code up, and patching operands that refer to code locations that 3777b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // are being moved. The array of offsets from the first step is used 3778b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // to compute the new operand values. 
3779b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t src; 3780b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t dst = 0; 3781b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (src=0; src<end; src++) { 378250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); 3783b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t opType = URX_TYPE(op); 3784b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru switch (opType) { 3785b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_NOP: 3786b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3787b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3788b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATE_SAVE: 3789b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP: 3790b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP: 3791b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_LOOP_NG: 3792b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RELOC_OPRND: 3793b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMPX: 3794b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV: 3795b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_JMP_SAV_X: 3796b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // These are instructions with operands that refer to code locations. 
3797b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru { 3798b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t operandAddress = URX_VAL(op); 3799b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); 3800b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress); 38011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert op = buildOp(opType, fixedOperandAddress); 3802b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, dst); 3803b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru dst++; 3804b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3805b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3806b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 380750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case URX_BACKREF: 380850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case URX_BACKREF_I: 380950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { 381050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t where = URX_VAL(op); 381150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (where > fRXPat->fGroupMap->size()) { 381250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho error(U_REGEX_INVALID_BACK_REF); 381350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 381450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 381550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho where = fRXPat->fGroupMap->elementAti(where-1); 38161b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert op = buildOp(opType, where); 381750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fCompiledPat->setElementAt(op, dst); 381850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dst++; 3819fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 382050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fRXPat->fNeedsAltInput = TRUE; 
382150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 382250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 3823b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RESERVED_OP: 3824b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_RESERVED_OP_N: 3825b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKTRACK: 3826b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END: 3827b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_ONECHAR: 3828103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius case URX_STRING: 3829b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STRING_LEN: 3830b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_START_CAPTURE: 3831b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_END_CAPTURE: 3832b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STATIC_SETREF: 3833b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STAT_SETREF_N: 3834b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_SETREF: 3835b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY: 3836b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_FAIL: 3837b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_B: 3838b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_BU: 3839b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_G: 3840b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_X: 3841b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_Z: 3842b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOTANY_ALL: 3843b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_BACKSLASH_D: 3844b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET: 
3845b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOLLAR: 3846b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT: 3847b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CTR_INIT_NG: 3848c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOTANY_UNIX: 3849b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_SP: 3850b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LD_SP: 3851b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_STO_INP_LOC: 3852b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_START: 3853b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LA_END: 3854103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius case URX_ONECHAR_I: 3855103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius case URX_STRING_I: 3856b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_DOLLAR_M: 3857b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_CARET_M: 3858c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_CARET_M_UNIX: 3859b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_START: 3860b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_CONT: 3861b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LB_END: 3862b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_CONT: 3863b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LBN_END: 3864b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_SR_I: 3865b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_DOT_I: 3866b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru case URX_LOOP_C: 3867c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_D: 3868c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case URX_DOLLAR_MD: 
38691b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_H: 38701b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_R: 38711b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert case URX_BACKSLASH_V: 3872b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // These instructions are unaltered by the relocation. 3873b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setElementAt(op, dst); 3874b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru dst++; 3875b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 3876b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3877b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru default: 3878b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Some op is unaccounted for. 3879b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru U_ASSERT(FALSE); 3880b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_INTERNAL_ERROR); 3881b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3882b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3883b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3884b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fRXPat->fCompiledPat->setSize(dst); 3885b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3886b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3887b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3888b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3889b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3890b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3891b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3892b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Error Report a rule parse error. 
3893b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Only report it if no previous error has been recorded. 3894b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3895b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3896b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queruvoid RegexCompile::error(UErrorCode e) { 3897b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_SUCCESS(*fStatus)) { 3898b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru *fStatus = e; 389950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public 390050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are 390150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // int64_t. If the values of the latter are out of range for the former, 390250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // set them to the appropriate "field not supported" values. 
390350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (fLineNum > 0x7FFFFFFF) { 390450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->line = 0; 390550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->offset = -1; 390650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if (fCharNum > 0x7FFFFFFF) { 390750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->line = (int32_t)fLineNum; 390850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->offset = -1; 390950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 391050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->line = (int32_t)fLineNum; 391150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fParseErr->offset = (int32_t)fCharNum; 391250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 3913fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 391450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context 3915b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3916b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Fill in the context. 3917b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Note: extractBetween() pins supplied indicies to the string bounds. 
3918b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); 3919b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); 392050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); 392150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); 3922b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3923b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3924b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3925b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3926b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3927b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Assorted Unicode character constants. 3928b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Numeric because there is no portable way to enter them as literals. 3929b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// (Think EBCDIC). 3930b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3931b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic const UChar chCR = 0x0d; // New lines, for terminating comments. 3932c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chLF = 0x0a; // Line Feed 3933b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic const UChar chPound = 0x23; // '#', introduces a comment. 
3934c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chDigit0 = 0x30; // '0' 3935c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chDigit7 = 0x37; // '9' 3936c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chColon = 0x3A; // ':' 3937b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic const UChar chE = 0x45; // 'E' 3938c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chQ = 0x51; // 'Q' 393954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius//static const UChar chN = 0x4E; // 'N' 3940c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chP = 0x50; // 'P' 3941b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Querustatic const UChar chBackSlash = 0x5c; // '\' introduces a char escape 394254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius//static const UChar chLBracket = 0x5b; // '[' 3943c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chRBracket = 0x5d; // ']' 3944c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chUp = 0x5e; // '^' 3945c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chLowerP = 0x70; 3946c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chLBrace = 0x7b; // '{' 3947c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chRBrace = 0x7d; // '}' 3948c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chNEL = 0x85; // NEL newline variant 3949c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar chLS = 0x2028; // Unicode Line Separator 3950b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3951b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3952b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 
3953b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3954b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// nextCharLL Low Level Next Char from the regex pattern. 3955b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Get a char from the string, keep track of input position 3956b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// for error reporting. 3957b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3958b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3959b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUChar32 RegexCompile::nextCharLL() { 3960b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UChar32 ch; 3961b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3962b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fPeekChar != -1) { 3963b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ch = fPeekChar; 3964b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPeekChar = -1; 3965b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return ch; 3966b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3967fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 396850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // assume we're already in the right place 396950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ch = UTEXT_NEXT32(fRXPat->fPattern); 397050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if (ch == U_SENTINEL) { 397150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return ch; 3972b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3973b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3974b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ch == chCR || 3975b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ch == chNEL || 3976b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru ch == chLS || 
397727f654740f2a26ad62a5c155af9199af9e69b889claireho (ch == chLF && fLastChar != chCR)) { 3978b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Character is starting a new line. Bump up the line number, and 3979b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // reset the column to 0. 3980b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fLineNum++; 3981b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fCharNum=0; 3982b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3983b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru else { 3984b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Character is not starting a new line. Except in the case of a 3985b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // LF following a CR, increment the column position. 3986b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (ch != chLF) { 3987b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fCharNum++; 3988b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3989b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 3990b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fLastChar = ch; 3991b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return ch; 3992b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 3993b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 3994b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 3995b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3996b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// peekCharLL Low Level Character Scanning, sneak a peek at the next 3997b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// character without actually getting it. 
3998b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 3999b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 4000b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUChar32 RegexCompile::peekCharLL() { 4001b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fPeekChar == -1) { 4002b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru fPeekChar = nextCharLL(); 4003b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4004b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return fPeekChar; 4005b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4006b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4007b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4008b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 4009b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4010b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// nextChar for pattern scanning. At this level, we handle stripping 4011b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// out comments and processing some backslash character escapes. 4012b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The rest of the pattern grammar is handled at the next level up. 
//
//------------------------------------------------------------------------------
void RegexCompile::nextChar(RegexPatternChar &c) {
    // Fetch the next logical pattern character into c.
    //   c.fChar    receives the character, or -1 at the end of the input.
    //   c.fQuoted  is TRUE when the character comes from inside a \Q...\E region
    //              or from an escape sequence decoded by this function,
    //              and FALSE otherwise.

    // Record the native index of the pattern text before fetching; error()
    // uses fScanIndex to extract the context around a failure.
    fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
    c.fChar    = nextCharLL();
    c.fQuoted  = FALSE;

    if (fQuoteMode) {
        // Inside \Q ... \E: every character is literal.  The region ends at "\E"
        // (unless the whole pattern is being compiled with UREGEX_LITERAL) or at
        // the end of the input.
        c.fQuoted = TRUE;
        if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
            c.fChar == (UChar32)-1) {
            // NOTE(review): this path is also taken at end of input, where there is
            // no 'E' to discard; presumably nextCharLL() just reports end of input
            // again - confirm.
            fQuoteMode = FALSE;  //  Exit quote mode,
            nextCharLL();        // discard the E
            nextChar(c);         // recurse to get the real next char
        }
    }
    else if (fInBackslashQuote) {
        // The current character immediately follows a '\'
        // Don't check for any further escapes, just return it as-is.
        // Don't set c.fQuoted, because that would prevent the state machine from
        //    dispatching on the character.
        fInBackslashQuote = FALSE;
    }
    else
    {
        // We are not in a \Q quoted region \E of the source.
        //
        if (fModeFlags & UREGEX_COMMENTS) {
            //
            // We are in free-spacing and comments mode.
            //  Scan through any white space and comments, until we
            //  reach a significant character or the end of input.
            for (;;) {
                if (c.fChar == (UChar32)-1) {
                    break;     // End of Input
                }
                if  (c.fChar == chPound && fEOLComments == TRUE) {
                    // Start of a comment.  Consume the rest of it, until EOF or a new line
                    for (;;) {
                        c.fChar = nextCharLL();
                        if (c.fChar == (UChar32)-1 ||  // EOF
                            c.fChar == chCR        ||
                            c.fChar == chLF        ||
                            c.fChar == chNEL       ||
                            c.fChar == chLS)       {
                            break;
                        }
                    }
                }
                // The character that ended a comment falls through to the white
                // space test below, like any other character.
                // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
                if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
                    break;
                }
                c.fChar = nextCharLL();
            }
        }

        //
        //  check for backslash escaped characters.
        //
        if (c.fChar == chBackSlash) {
            // Native index captured before peeking at the escaped character.
            // NOTE(review): presumably the position just after the '\' - confirm.
            int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
                //
                // A '\' sequence that is handled by ICU's standard unescapeAt function.
                //   Includes \uxxxx, \n, \r, many others.
                //   Return the single equivalent character.
                //
                nextCharLL();                 // get & discard the peeked char.
                c.fQuoted = TRUE;

                if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
                    // The pattern is unescaped directly from the UText chunk buffer,
                    // starting at pos.  endIndex is advanced by u_unescapeAt().
                    int32_t endIndex = (int32_t)pos;
                    c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);

                    // An index that did not move is reported as a bad escape.
                    if (endIndex == pos) {
                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
                    }
                    fCharNum += endIndex - pos;
                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
                } else {
                    // Unescape through a callback that reads from the UText,
                    // using offsets counted from pos.
                    int32_t offset = 0;
                    struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);

                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
                    c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);

                    // NOTE(review): the adjustments below appear to leave the UText
                    // positioned just past the consumed escape, based on the last
                    // offset recorded in the context - confirm against
                    // uregex_utext_unescape_charAt.
                    if (offset == 0) {
                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
                    } else if (context.lastOffset == offset) {
                        UTEXT_PREVIOUS32(fRXPat->fPattern);
                    } else if (context.lastOffset != offset-1) {
                        utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
                    }
                    fCharNum += offset;
                }
            }
            else if (peekCharLL() == chDigit0) {
                //  Octal Escape, using Java Regexp Conventions
                //    which are \0 followed by 1-3 octal digits.
                //    Different from ICU Unescape handling of Octal, which does not
                //    require the leading 0.
                //  Java also has the convention of only consuming 2 octal digits if
                //    the three digit number would be > 0xff
                //
                c.fChar = 0;
                nextCharLL();    // Consume the initial 0.
                int index;
                for (index=0; index<3; index++) {
                    int32_t ch = peekCharLL();
                    if (ch<chDigit0 || ch>chDigit7) {
                        if (index==0) {
                           // \0 is not followed by any octal digits.
                           error(U_REGEX_BAD_ESCAPE_SEQUENCE);
                        }
                        break;
                    }
                    // Append the digit's value (the low three bits of '0'..'7').
                    c.fChar <<= 3;
                    c.fChar += ch&7;
                    if (c.fChar <= 255) {
                        // The digit is accepted; consume it.
                        nextCharLL();
                    } else {
                        // The last digit made the number too big.  Forget we saw it.
                        // The digit is left unconsumed in the peek buffer.
                        c.fChar >>= 3;
                    }
                }
                c.fQuoted = TRUE;
            }
            else if (peekCharLL() == chQ) {
                //  "\Q"  enter quote mode, which will continue until "\E"
                fQuoteMode = TRUE;
                nextCharLL();       // discard the 'Q'.
                nextChar(c);        // recurse to get the real next char.
            }
            else
            {
                // We are in a '\' escape that will be handled by the state table scanner.
                // Just return the backslash, but remember that the following char is to
                //  be taken literally.
                fInBackslashQuote = TRUE;
            }
        }
    }

    // re-enable # to end-of-line comments, in case they were disabled.
    // They are disabled by the parser upon seeing '(?', but this lasts for
    //  the fetching of the next character only.
    fEOLComments = TRUE;

    // putc(c.fChar, stdout);
}



//------------------------------------------------------------------------------
//
//  scanNamedChar
//            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
//
//             The scan position will be at the 'N'.
On return 4174c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// the scan position should be just after the '}' 4175b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4176c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Return the UChar32 4177b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4178b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 4179c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruUChar32 RegexCompile::scanNamedChar() { 4180b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 4181c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return 0; 4182b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4183b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4184c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4185c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar != chLBrace) { 4186c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 4187c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return 0; 4188b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4189fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 4190c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString charName; 4191b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4192c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4193c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar == chRBrace) { 4194b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4195b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4196c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar == -1) { 4197c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 
4198c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return 0; 4199c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4200c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru charName.append(fC.fChar); 4201b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 4203c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru char name[100]; 4204c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || 4205c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru (uint32_t)charName.length()>=sizeof(name)) { 4206c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // All Unicode character names have only invariant characters. 4207c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The API to get a character, given a name, accepts only char *, forcing us to convert, 4208c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // which requires this error check 4209c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 4210c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return 0; 4211c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4212c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru charName.extract(0, charName.length(), name, sizeof(name), US_INV); 4213b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4214c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); 4215c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 4216c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 4217c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4218b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4219c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste 
Queru nextChar(fC); // Continue overall regex pattern processing with char after the '}' 4220c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return theChar; 4221c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 4222b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4223b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 4224b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4225b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// scanProp Construct a UnicodeSet from the text at the current scan 4226b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// position, which will be of the form \p{whaterver} 4227b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4228b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// The scan position will be at the 'p' or 'P'. On return 4229b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// the scan position should be just after the '}' 4230b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4231b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// Return a UnicodeSet, constructed from the \P pattern, 4232b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// or NULL if the pattern is invalid. 
4233b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru// 4234b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru//------------------------------------------------------------------------------ 4235b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruUnicodeSet *RegexCompile::scanProp() { 4236b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru UnicodeSet *uset = NULL; 4237b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4238b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 4239b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 4240b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (void)chLowerP; // Suppress compiler unused variable warning. 4242c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); 4243c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool negated = (fC.fChar == chP); 4244b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4245c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString propertyName; 4246c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4247c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar != chLBrace) { 4248c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 4249c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 4250c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4251b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru for (;;) { 4252c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4253b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fC.fChar == chRBrace) { 4254b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru break; 4255b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 
4256b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fC.fChar == -1) { 4257b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru // Hit the end of the input string without finding the closing '}' 4258b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru error(U_REGEX_PROPERTY_SYNTAX); 4259b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru return NULL; 4260b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4261c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru propertyName.append(fC.fChar); 4262c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4263c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uset = createSetForProperty(propertyName, negated); 4264c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); // Move input scan to position following the closing '}' 4265c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return uset; 4266c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 4267c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4268c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//------------------------------------------------------------------------------ 4269c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4270c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// scanPosixProp Construct a UnicodeSet from the text at the current scan 4271c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// position, which is expected be of the form [:property expression:] 4272c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4273c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// The scan position will be at the opening ':'. 
On return 4274c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// the scan position must be on the closing ']' 4275c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4276c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Return a UnicodeSet constructed from the pattern, 4277c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// or NULL if this is not a valid POSIX-style set expression. 4278c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// If not a property expression, restore the initial scan position 4279c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// (to the opening ':') 4280c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4281c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Note: the opening '[:' is not sufficient to guarantee that 4282c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// this is a [:property:] expression. 4283c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// [:'+=,] is a perfectly good ordinary set expression that 4284c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// happens to include ':' as one of its characters. 
4285c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4286c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru//------------------------------------------------------------------------------ 4287c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruUnicodeSet *RegexCompile::scanPosixProp() { 4288c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *uset = NULL; 4289c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4290c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 4291c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 4292b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4293b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4294c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fC.fChar == chColon); 4295c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4296c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Save the scanner state. 4297c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: move this into the scanner, with the state encapsulated in some way. 
Ticket 6062 429850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t savedScanIndex = fScanIndex; 429950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 4300c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool savedQuoteMode = fQuoteMode; 4301c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool savedInBackslashQuote = fInBackslashQuote; 4302c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool savedEOLComments = fEOLComments; 430350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t savedLineNum = fLineNum; 430450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int64_t savedCharNum = fCharNum; 4305c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 savedLastChar = fLastChar; 4306c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UChar32 savedPeekChar = fPeekChar; 4307c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru RegexPatternChar savedfC = fC; 4308c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4309c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scan for a closing ]. A little tricky because there are some perverse 4310c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, 4311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // ending on the second closing ]. 4312c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4313c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString propName; 4314c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool negated = FALSE; 4315c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4316c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Check for and consume the '^' in a negated POSIX property, e.g. 
[:^Letter:] 4317c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4318c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar == chUp) { 4319c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru negated = TRUE; 4320c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4321c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 4323c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Scan for the closing ":]", collecting the property name along the way. 4324c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UBool sawPropSetTerminator = FALSE; 4325c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (;;) { 4326c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru propName.append(fC.fChar); 4327c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4328c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fQuoted || fC.fChar == -1) { 4329c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Escaped characters or end of input - either says this isn't a [:Property:] 4330c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4331c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4332c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar == chColon) { 4333c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru nextChar(fC); 4334c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (fC.fChar == chRBracket) { 4335c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru sawPropSetTerminator = TRUE; 4336c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4337c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4338c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4339c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 
4341c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (sawPropSetTerminator) { 4342c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uset = createSetForProperty(propName, negated); 4343c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4344c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else 4345c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 4346c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // No closing ":]". 4347c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Restore the original scan position. 4348c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The main scanner will retry the input as a normal set expression, 4349c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // not a [:Property:] expression. 4350c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fScanIndex = savedScanIndex; 4351c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fQuoteMode = savedQuoteMode; 4352c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fInBackslashQuote = savedInBackslashQuote; 4353c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fEOLComments = savedEOLComments; 4354c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLineNum = savedLineNum; 4355c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fCharNum = savedCharNum; 4356c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fLastChar = savedLastChar; 4357c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fPeekChar = savedPeekChar; 4358c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fC = savedfC; 435950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); 4360c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4361c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return uset; 4362c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 
4363c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4364c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { 4365c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); 4366c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_CF_MASK, ec); 4367c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 4368c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4369c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4370c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Create a Unicode Set from a Unicode Property expression. 4371c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// This is common code underlying both \p{...} ane [:...:] expressions. 4372c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Includes trying the Java "properties" that aren't supported as 4373fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// normal ICU UnicodeSet properties 4374c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4375c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" 4376c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Querustatic const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" 4377c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste QueruUnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { 4378c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString setExpr; 4379c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *set; 4380c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru uint32_t usetFlags = 0; 4381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 4382c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 
4383c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return NULL; 4384c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4385c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4386c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4387c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // First try the property as we received it 4388c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4389c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (negated) { 4390c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(negSetPrefix, -1); 4391c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } else { 4392c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(posSetPrefix, -1); 4393c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4394c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(propName); 4395c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(chRBrace); 4396c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(chRBracket); 4397b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 4398b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru usetFlags |= USET_CASE_INSENSITIVE; 4399b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4400c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4401c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_SUCCESS(*fStatus)) { 4402c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return set; 4403c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4404c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete set; 4405c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = NULL; 4406fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 
4407c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4408c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // The property as it was didn't work. 440927f654740f2a26ad62a5c155af9199af9e69b889claireho 4410fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX 441127f654740f2a26ad62a5c155af9199af9e69b889claireho // or standard Java, but many other regular expression packages do recognize it. 4412fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 441327f654740f2a26ad62a5c155af9199af9e69b889claireho if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { 441427f654740f2a26ad62a5c155af9199af9e69b889claireho *fStatus = U_ZERO_ERROR; 441527f654740f2a26ad62a5c155af9199af9e69b889claireho set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); 441627f654740f2a26ad62a5c155af9199af9e69b889claireho if (set == NULL) { 441727f654740f2a26ad62a5c155af9199af9e69b889claireho *fStatus = U_MEMORY_ALLOCATION_ERROR; 441827f654740f2a26ad62a5c155af9199af9e69b889claireho return set; 441927f654740f2a26ad62a5c155af9199af9e69b889claireho } 442027f654740f2a26ad62a5c155af9199af9e69b889claireho if (negated) { 442127f654740f2a26ad62a5c155af9199af9e69b889claireho set->complement(); 442227f654740f2a26ad62a5c155af9199af9e69b889claireho } 442327f654740f2a26ad62a5c155af9199af9e69b889claireho return set; 442427f654740f2a26ad62a5c155af9199af9e69b889claireho } 442527f654740f2a26ad62a5c155af9199af9e69b889claireho 442627f654740f2a26ad62a5c155af9199af9e69b889claireho 442727f654740f2a26ad62a5c155af9199af9e69b889claireho // Do Java fixes - 4428c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // InGreek -> InGreek or Coptic, that being the official Unicode name for that block. 4429c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols. 
4430c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4431c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols" 4432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // is accepted by Java. The property part of the name is compared 4433c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // case-insenstively. The spaces must be exactly as shown, either 4434c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // all there, or all omitted, with exactly one at each position 4435c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // if they are present. From checking against JDK 1.6 4436c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4437c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // This code should be removed when ICU properties support the Java compatibility names 4438c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // (ICU 4.0?) 
4439c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4440c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeString mPropName = propName; 4441c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { 4442c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); 4443c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4444c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 || 4445c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) { 4446c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols"); 4447c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4448c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4449c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); 4450c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4451fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 4452c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // See if the property looks like a Java "InBlockName", which 4453c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // we will recast as "Block=BlockName" 4454c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4455c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru static const UChar IN[] = {0x49, 0x6E, 0}; // "In" 4456c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block=" 4457c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 
if (mPropName.startsWith(IN, 2) && propName.length()>=3) { 4458c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.truncate(4); // Leaves "[\p{", or "[\P{" 4459c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(BLOCK, -1); 4460c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed. 4461c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(chRBrace); 4462c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setExpr.append(chRBracket); 4463c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *fStatus = U_ZERO_ERROR; 4464c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4465c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_SUCCESS(*fStatus)) { 4466c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return set; 4467c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4468c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete set; 4469c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = NULL; 4470b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4471b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4472c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) || 4473c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru propName.compare(UNICODE_STRING_SIMPLE("all")) == 0) 4474c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru { 4475c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UErrorCode localStatus = U_ZERO_ERROR; 4476c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru //setExpr.remove(); 4477c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = new UnicodeSet(); 4478c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 
4479c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // Try the various Java specific properties. 4480c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // These all begin with "java" 4481c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // 4482c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) { 4483c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_CN_MASK, localStatus); 4484c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->complement(); 4485c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4486c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) { 4487c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_ND_MASK, localStatus); 4488c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4489c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) { 4490c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addIdentifierIgnorable(set, localStatus); 4491c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4492c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) { 4493c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(0, 0x1F).add(0x7F, 0x9F); 4494c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4495c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) { 4496c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4497c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_SC_MASK, localStatus); 
4498c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_PC_MASK, localStatus); 4499c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_ND_MASK, localStatus); 4500c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_NL_MASK, localStatus); 4501c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_MC_MASK, localStatus); 4502c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_MN_MASK, localStatus); 4503c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addIdentifierIgnorable(set, localStatus); 4504c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4505c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) { 4506c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4507c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_NL_MASK, localStatus); 4508c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_SC_MASK, localStatus); 4509c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_PC_MASK, localStatus); 4510c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4511c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) { 4512c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4513c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4514c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) { 4515c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4516c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 
addCategory(set, U_GC_ND_MASK, localStatus); 4517c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4518c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) { 4519c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_LL_MASK, localStatus); 4520c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4521c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) { 4522c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus); 4523c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4524c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) { 4525c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_Z_MASK, localStatus); 4526c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4527c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) { 4528c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(0x10000, UnicodeSet::MAX_VALUE); 4529c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4530c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) { 4531c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_LT_MASK, localStatus); 4532c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4533c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) { 4534c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 
4535c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_NL_MASK, localStatus); 4536c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4537c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) { 4538c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_L_MASK, localStatus); 4539c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_PC_MASK, localStatus); 4540c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_ND_MASK, localStatus); 4541c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_NL_MASK, localStatus); 4542c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_MC_MASK, localStatus); 4543c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_MN_MASK, localStatus); 4544c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addIdentifierIgnorable(set, localStatus); 4545c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4546c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) { 4547c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, U_GC_LU_MASK, localStatus); 4548c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4549c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) { 4550c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(0, UnicodeSet::MAX_VALUE); 4551c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4552c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) { 4553c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru addCategory(set, 
U_GC_Z_MASK, localStatus); 4554c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); 4555c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(9, 0x0d).add(0x1c, 0x1f); 4556c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4557c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4558c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->add(0, UnicodeSet::MAX_VALUE); 4559c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4560c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4561c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (U_SUCCESS(localStatus) && !set->isEmpty()) { 4562c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru *fStatus = U_ZERO_ERROR; 4563c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (usetFlags & USET_CASE_INSENSITIVE) { 4564c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->closeOver(USET_CASE_INSENSITIVE); 4565c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4566c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if (negated) { 4567c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set->complement(); 4568c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4569c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru return set; 4570c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4571c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete set; 4572c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru set = NULL; 4573b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru } 4574c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru error(*fStatus); 4575fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 4576c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru} 
4577b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4578c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4579c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4580c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4581c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// SetEval Part of the evaluation of [set expressions]. 4582c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// Perform any pending (stacked) operations with precedence 4583c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// equal or greater to that of the next operator encountered 4584c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// in the expression. 4585c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru// 4586c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruvoid RegexCompile::setEval(int32_t nextOp) { 4587c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *rightOperand = NULL; 4588c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru UnicodeSet *leftOperand = NULL; 4589c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru for (;;) { 4590c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fSetOpStack.empty()==FALSE); 4591c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru int32_t pendingSetOperation = fSetOpStack.peeki(); 4592c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { 4593c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4594c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4595c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.popi(); 4596c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(fSetStack.empty() == FALSE); 4597c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru rightOperand = (UnicodeSet *)fSetStack.peek(); 4598c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru switch 
(pendingSetOperation) { 4599c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setNegation: 4600c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru rightOperand->complement(); 4601c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4602c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setCaseClose: 4603c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru // TODO: need a simple close function. Ticket 6065 4604c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru rightOperand->closeOver(USET_CASE_INSENSITIVE); 4605c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru rightOperand->removeAllStrings(); 4606c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4607c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setDifference1: 4608c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setDifference2: 4609c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetStack.pop(); 4610c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand = (UnicodeSet *)fSetStack.peek(); 4611c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand->removeAll(*rightOperand); 4612c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete rightOperand; 4613c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4614c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setIntersection1: 4615c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setIntersection2: 4616c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetStack.pop(); 4617c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand = (UnicodeSet *)fSetStack.peek(); 4618c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand->retainAll(*rightOperand); 4619c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete rightOperand; 4620c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 
4621c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru case setUnion: 4622c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetStack.pop(); 4623c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand = (UnicodeSet *)fSetStack.peek(); 4624c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru leftOperand->addAll(*rightOperand); 4625c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru delete rightOperand; 4626c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4627c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru default: 4628c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru U_ASSERT(FALSE); 4629c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru break; 4630c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4631c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4632c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru } 4633c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru 4634c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queruvoid RegexCompile::setPushOp(int32_t op) { 4635c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru setEval(op); 4636c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetOpStack.push(op, *fStatus); 4637c69afcec261fc345fda8daf46f0ea6b4351dc777Jean-Baptiste Queru fSetStack.push(new UnicodeSet(), *fStatus); 4638b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru} 4639b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4640b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste QueruU_NAMESPACE_END 4641b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 4642b13da9df870a61b11249bf741347908dbea0edd8Jean-Baptiste Queru 4643