regexcmp.cpp revision 1b7d32f919554dda9c193b32188251337bc756f1
1//
2//  file:  regexcmp.cpp
3//
4//  Copyright (C) 2002-2015 International Business Machines Corporation and others.
5//  All Rights Reserved.
6//
7//  This file contains the ICU regular expression compiler, which is responsible
8//  for processing a regular expression pattern into the compiled form that
9//  is used by the match finding engine.
10//
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16#include "unicode/ustring.h"
17#include "unicode/unistr.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/uchriter.h"
21#include "unicode/parsepos.h"
22#include "unicode/parseerr.h"
23#include "unicode/regex.h"
24#include "unicode/utf.h"
25#include "unicode/utf16.h"
26#include "patternprops.h"
27#include "putilimp.h"
28#include "cmemory.h"
29#include "cstring.h"
30#include "uvectr32.h"
31#include "uvectr64.h"
32#include "uassert.h"
33#include "uinvchar.h"
34
35#include "regeximp.h"
36#include "regexcst.h"   // Contains state table for the regex pattern parser.
37                        //   generated by a Perl script.
38#include "regexcmp.h"
39#include "regexst.h"
40#include "regextxt.h"
41
42
43
44U_NAMESPACE_BEGIN
45
46
47//------------------------------------------------------------------------------
48//
49//  Constructor.
50//
51//------------------------------------------------------------------------------
52RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
53   fParenStack(status), fSetStack(status), fSetOpStack(status)
54{
55    // Lazy init of all shared global sets (needed for init()'s empty text)
56    RegexStaticSets::initGlobals(&status);
57
58    fStatus           = &status;
59
60    fRXPat            = rxp;
61    fScanIndex        = 0;
62    fLastChar         = -1;
63    fPeekChar         = -1;
64    fLineNum          = 1;
65    fCharNum          = 0;
66    fQuoteMode        = FALSE;
67    fInBackslashQuote = FALSE;
68    fModeFlags        = fRXPat->fFlags | 0x80000000;
69    fEOLComments      = TRUE;
70
71    fMatchOpenParen   = -1;
72    fMatchCloseParen  = -1;
73    fCaptureName      = NULL;
74
75    if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
76        status = rxp->fDeferredStatus;
77    }
78}
79
80static const UChar      chAmp       = 0x26;      // '&'
81static const UChar      chDash      = 0x2d;      // '-'
82
83
84//------------------------------------------------------------------------------
85//
86//  Destructor
87//
88//------------------------------------------------------------------------------
89RegexCompile::~RegexCompile() {
90    delete fCaptureName;         // Normally will be NULL, but can exist if pattern
91                                 //   compilation stops with a syntax error.
92}
93
94static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
95    set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
96}
97
98//------------------------------------------------------------------------------
99//
100//  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
101//                           The state tables are hand-written in the file regexcst.txt,
102//                           and converted to the form used here by a perl
103//                           script regexcst.pl
104//
105//------------------------------------------------------------------------------
106void    RegexCompile::compile(
107                         const UnicodeString &pat,   // Source pat to be compiled.
108                         UParseError &pp,            // Error position info
109                         UErrorCode &e)              // Error Code
110{
111    fRXPat->fPatternString = new UnicodeString(pat);
112    UText patternText = UTEXT_INITIALIZER;
113    utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
114
115    if (U_SUCCESS(e)) {
116        compile(&patternText, pp, e);
117        utext_close(&patternText);
118    }
119}
120
121//
122//   compile, UText mode
123//     All the work is actually done here.
124//
125void    RegexCompile::compile(
126                         UText *pat,                 // Source pat to be compiled.
127                         UParseError &pp,            // Error position info
128                         UErrorCode &e)              // Error Code
129{
130    fStatus             = &e;
131    fParseErr           = &pp;
132    fStackPtr           = 0;
133    fStack[fStackPtr]   = 0;
134
135    if (U_FAILURE(*fStatus)) {
136        return;
137    }
138
139    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
140    U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
141
142    // Prepare the RegexPattern object to receive the compiled pattern.
143    fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
144    if (U_FAILURE(*fStatus)) {
145        return;
146    }
147    fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
148    fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
149
150
151    // Initialize the pattern scanning state machine
152    fPatternLength = utext_nativeLength(pat);
153    uint16_t                state = 1;
154    const RegexTableEl      *tableEl;
155
156    // UREGEX_LITERAL force entire pattern to be treated as a literal string.
157    if (fModeFlags & UREGEX_LITERAL) {
158        fQuoteMode = TRUE;
159    }
160
161    nextChar(fC);                        // Fetch the first char from the pattern string.
162
163    //
164    // Main loop for the regex pattern parsing state machine.
165    //   Runs once per state transition.
166    //   Each time through optionally performs, depending on the state table,
167    //      - an advance to the the next pattern char
168    //      - an action to be performed.
169    //      - pushing or popping a state to/from the local state return stack.
170    //   file regexcst.txt is the source for the state table.  The logic behind
171    //     recongizing the pattern syntax is there, not here.
172    //
173    for (;;) {
174        //  Bail out if anything has gone wrong.
175        //  Regex pattern parsing stops on the first error encountered.
176        if (U_FAILURE(*fStatus)) {
177            break;
178        }
179
180        U_ASSERT(state != 0);
181
182        // Find the state table element that matches the input char from the pattern, or the
183        //    class of the input character.  Start with the first table row for this
184        //    state, then linearly scan forward until we find a row that matches the
185        //    character.  The last row for each state always matches all characters, so
186        //    the search will stop there, if not before.
187        //
188        tableEl = &gRuleParseStateTable[state];
189        REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
190            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
191
192        for (;;) {    // loop through table rows belonging to this state, looking for one
193                      //   that matches the current input char.
194            REGEX_SCAN_DEBUG_PRINTF(("."));
195            if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
196                // Table row specified an individual character, not a set, and
197                //   the input character is not quoted, and
198                //   the input character matched it.
199                break;
200            }
201            if (tableEl->fCharClass == 255) {
202                // Table row specified default, match anything character class.
203                break;
204            }
205            if (tableEl->fCharClass == 254 && fC.fQuoted)  {
206                // Table row specified "quoted" and the char was quoted.
207                break;
208            }
209            if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
210                // Table row specified eof and we hit eof on the input.
211                break;
212            }
213
214            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
215                fC.fQuoted == FALSE &&                                       //   char is not escaped &&
216                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
217                U_ASSERT(tableEl->fCharClass <= 137);
218                if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
219                    // Table row specified a character class, or set of characters,
220                    //   and the current char matches it.
221                    break;
222                }
223            }
224
225            // No match on this row, advance to the next  row for this state,
226            tableEl++;
227        }
228        REGEX_SCAN_DEBUG_PRINTF(("\n"));
229
230        //
231        // We've found the row of the state table that matches the current input
232        //   character from the rules string.
233        // Perform any action specified  by this row in the state table.
234        if (doParseActions(tableEl->fAction) == FALSE) {
235            // Break out of the state machine loop if the
236            //   the action signalled some kind of error, or
237            //   the action was to exit, occurs on normal end-of-rules-input.
238            break;
239        }
240
241        if (tableEl->fPushState != 0) {
242            fStackPtr++;
243            if (fStackPtr >= kStackSize) {
244                error(U_REGEX_INTERNAL_ERROR);
245                REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
246                fStackPtr--;
247            }
248            fStack[fStackPtr] = tableEl->fPushState;
249        }
250
251        //
252        //  NextChar.  This is where characters are actually fetched from the pattern.
253        //             Happens under control of the 'n' tag in the state table.
254        //
255        if (tableEl->fNextChar) {
256            nextChar(fC);
257        }
258
259        // Get the next state from the table entry, or from the
260        //   state stack if the next state was specified as "pop".
261        if (tableEl->fNextState != 255) {
262            state = tableEl->fNextState;
263        } else {
264            state = fStack[fStackPtr];
265            fStackPtr--;
266            if (fStackPtr < 0) {
267                // state stack underflow
268                // This will occur if the user pattern has mis-matched parentheses,
269                //   with extra close parens.
270                //
271                fStackPtr++;
272                error(U_REGEX_MISMATCHED_PAREN);
273            }
274        }
275
276    }
277
278    if (U_FAILURE(*fStatus)) {
279        // Bail out if the pattern had errors.
280        //   Set stack cleanup:  a successful compile would have left it empty,
281        //   but errors can leave temporary sets hanging around.
282        while (!fSetStack.empty()) {
283            delete (UnicodeSet *)fSetStack.pop();
284        }
285        return;
286    }
287
288    //
289    // The pattern has now been read and processed, and the compiled code generated.
290    //
291
292    //
293    // The pattern's fFrameSize so far has accumulated the requirements for
294    //   storage for capture parentheses, counters, etc. that are encountered
295    //   in the pattern.  Add space for the two variables that are always
296    //   present in the saved state:  the input string position (int64_t) and
297    //   the position in the compiled pattern.
298    //
299    allocateStackData(RESTACKFRAME_HDRCOUNT);
300
301    //
302    // Optimization pass 1: NOPs, back-references, and case-folding
303    //
304    stripNOPs();
305
306    //
307    // Get bounds for the minimum and maximum length of a string that this
308    //   pattern can match.  Used to avoid looking for matches in strings that
309    //   are too short.
310    //
311    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
312
313    //
314    // Optimization pass 2: match start type
315    //
316    matchStartType();
317
318    //
319    // Set up fast latin-1 range sets
320    //
321    int32_t numSets = fRXPat->fSets->size();
322    fRXPat->fSets8 = new Regex8BitSet[numSets];
323    // Null pointer check.
324    if (fRXPat->fSets8 == NULL) {
325        e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
326        return;
327    }
328    int32_t i;
329    for (i=0; i<numSets; i++) {
330        UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
331        fRXPat->fSets8[i].init(s);
332    }
333
334}
335
336
337
338
339
340//------------------------------------------------------------------------------
341//
342//  doParseAction        Do some action during regex pattern parsing.
343//                       Called by the parse state machine.
344//
345//                       Generation of the match engine PCode happens here, or
346//                       in functions called from the parse actions defined here.
347//
348//
349//------------------------------------------------------------------------------
350UBool RegexCompile::doParseActions(int32_t action)
351{
352    UBool   returnVal = TRUE;
353
354    switch ((Regex_PatternParseAction)action) {
355
356    case doPatStart:
357        // Start of pattern compiles to:
358        //0   SAVE   2        Fall back to position of FAIL
359        //1   jmp    3
360        //2   FAIL            Stop if we ever reach here.
361        //3   NOP             Dummy, so start of pattern looks the same as
362        //                    the start of an ( grouping.
363        //4   NOP             Resreved, will be replaced by a save if there are
364        //                    OR | operators at the top level
365        appendOp(URX_STATE_SAVE, 2);
366        appendOp(URX_JMP,  3);
367        appendOp(URX_FAIL, 0);
368
369        // Standard open nonCapture paren action emits the two NOPs and
370        //   sets up the paren stack frame.
371        doParseActions(doOpenNonCaptureParen);
372        break;
373
374    case doPatFinish:
375        // We've scanned to the end of the pattern
376        //  The end of pattern compiles to:
377        //        URX_END
378        //    which will stop the runtime match engine.
379        //  Encountering end of pattern also behaves like a close paren,
380        //   and forces fixups of the State Save at the beginning of the compiled pattern
381        //   and of any OR operations at the top level.
382        //
383        handleCloseParen();
384        if (fParenStack.size() > 0) {
385            // Missing close paren in pattern.
386            error(U_REGEX_MISMATCHED_PAREN);
387        }
388
389        // add the END operation to the compiled pattern.
390        appendOp(URX_END, 0);
391
392        // Terminate the pattern compilation state machine.
393        returnVal = FALSE;
394        break;
395
396
397
398    case doOrOperator:
399        // Scanning a '|', as in (A|B)
400        {
401            // Generate code for any pending literals preceding the '|'
402            fixLiterals(FALSE);
403
404            // Insert a SAVE operation at the start of the pattern section preceding
405            //   this OR at this level.  This SAVE will branch the match forward
406            //   to the right hand side of the OR in the event that the left hand
407            //   side fails to match and backtracks.  Locate the position for the
408            //   save from the location on the top of the parentheses stack.
409            int32_t savePosition = fParenStack.popi();
410            int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
411            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
412            op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
413            fRXPat->fCompiledPat->setElementAt(op, savePosition);
414
415            // Append an JMP operation into the compiled pattern.  The operand for
416            //  the JMP will eventually be the location following the ')' for the
417            //  group.  This will be patched in later, when the ')' is encountered.
418            appendOp(URX_JMP, 0);
419
420            // Push the position of the newly added JMP op onto the parentheses stack.
421            // This registers if for fixup when this block's close paren is encountered.
422            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
423
424            // Append a NOP to the compiled pattern.  This is the slot reserved
425            //   for a SAVE in the event that there is yet another '|' following
426            //   this one.
427            appendOp(URX_NOP, 0);
428            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
429        }
430        break;
431
432
433    case doBeginNamedCapture:
434        // Scanning (?<letter.
435        //   The first letter of the name will come through again under doConinueNamedCapture.
436        fCaptureName = new UnicodeString();
437        if (fCaptureName == NULL) {
438            error(U_MEMORY_ALLOCATION_ERROR);
439        }
440        break;
441
442    case  doContinueNamedCapture:
443        fCaptureName->append(fC.fChar);
444        break;
445
446    case doBadNamedCapture:
447        error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
448        break;
449
450    case doOpenCaptureParen:
451        // Open Capturing Paren, possibly named.
452        //   Compile to a
453        //      - NOP, which later may be replaced by a save-state if the
454        //         parenthesized group gets a * quantifier, followed by
455        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
456        //      - NOP, which may later be replaced by a save-state if there
457        //             is an '|' alternation within the parens.
458        //
459        //    Each capture group gets three slots in the save stack frame:
460        //         0: Capture Group start position (in input string being matched.)
461        //         1: Capture Group end position.
462        //         2: Start of Match-in-progress.
463        //    The first two locations are for a completed capture group, and are
464        //     referred to by back references and the like.
465        //    The third location stores the capture start position when an START_CAPTURE is
466        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
467        //      END_CAPTURE is encountered.
468        {
469            fixLiterals();
470            appendOp(URX_NOP, 0);
471            int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
472            appendOp(URX_START_CAPTURE, varsLoc);
473            appendOp(URX_NOP, 0);
474
475            // On the Parentheses stack, start a new frame and add the postions
476            //   of the two NOPs.  Depending on what follows in the pattern, the
477            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
478            //   address of the end of the parenthesized group.
479            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
480            fParenStack.push(capturing, *fStatus);                        // Frame type.
481            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
482            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
483
484            // Save the mapping from group number to stack frame variable position.
485            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
486
487            // If this is a named capture group, add the name->group number mapping.
488            if (fCaptureName != NULL) {
489                int32_t groupNumber = fRXPat->fGroupMap->size();
490                int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
491                fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
492                if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
493                    error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
494                }
495            }
496        }
497        break;
498
499    case doOpenNonCaptureParen:
500        // Open non-caputuring (grouping only) Paren.
501        //   Compile to a
502        //      - NOP, which later may be replaced by a save-state if the
503        //         parenthesized group gets a * quantifier, followed by
504        //      - NOP, which may later be replaced by a save-state if there
505        //             is an '|' alternation within the parens.
506        {
507            fixLiterals();
508            appendOp(URX_NOP, 0);
509            appendOp(URX_NOP, 0);
510
511            // On the Parentheses stack, start a new frame and add the postions
512            //   of the two NOPs.
513            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
514            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
515            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
516            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
517        }
518         break;
519
520
521    case doOpenAtomicParen:
522        // Open Atomic Paren.  (?>
523        //   Compile to a
524        //      - NOP, which later may be replaced if the parenthesized group
525        //         has a quantifier, followed by
526        //      - STO_SP  save state stack position, so it can be restored at the ")"
527        //      - NOP, which may later be replaced by a save-state if there
528        //             is an '|' alternation within the parens.
529        {
530            fixLiterals();
531            appendOp(URX_NOP, 0);
532            int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
533            appendOp(URX_STO_SP, varLoc);
534            appendOp(URX_NOP, 0);
535
536            // On the Parentheses stack, start a new frame and add the postions
537            //   of the two NOPs.  Depending on what follows in the pattern, the
538            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
539            //   address of the end of the parenthesized group.
540            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
541            fParenStack.push(atomic, *fStatus);                           // Frame type.
542            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
543            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
544        }
545        break;
546
547
548    case doOpenLookAhead:
549        // Positive Look-ahead   (?=  stuff  )
550        //
551        //   Note:   Addition of transparent input regions, with the need to
552        //           restore the original regions when failing out of a lookahead
553        //           block, complicated this sequence.  Some conbined opcodes
554        //           might make sense - or might not, lookahead aren't that common.
555        //
556        //      Caution:  min match length optimization knows about this
557        //               sequence; don't change without making updates there too.
558        //
559        // Compiles to
560        //    1    START_LA     dataLoc     Saves SP, Input Pos
561        //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
562        //    3    JMP          6           continue ...
563        //
564        //    4.   LA_END                   Look Ahead failed.  Restore regions.
565        //    5.   BACKTRACK                and back track again.
566        //
567        //    6.   NOP              reserved for use by quantifiers on the block.
568        //                          Look-ahead can't have quantifiers, but paren stack
569        //                             compile time conventions require the slot anyhow.
570        //    7.   NOP              may be replaced if there is are '|' ops in the block.
571        //    8.     code for parenthesized stuff.
572        //    9.   LA_END
573        //
574        //  Two data slots are reserved, for saving the stack ptr and the input position.
575        {
576            fixLiterals();
577            int32_t dataLoc = allocateData(2);
578            appendOp(URX_LA_START, dataLoc);
579            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
580            appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
581            appendOp(URX_LA_END, dataLoc);
582            appendOp(URX_BACKTRACK, 0);
583            appendOp(URX_NOP, 0);
584            appendOp(URX_NOP, 0);
585
586            // On the Parentheses stack, start a new frame and add the postions
587            //   of the NOPs.
588            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
589            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
590            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
591            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
592        }
593        break;
594
595    case doOpenLookAheadNeg:
596        // Negated Lookahead.   (?! stuff )
597        // Compiles to
598        //    1.    START_LA    dataloc
599        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
600        //                                //   which continues with the match.
601        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
602        //    4.       code for parenthesized stuff.
603        //    5.    END_LA                // Cut back stack, remove saved state from step 2.
604        //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
605        //    7.    END_LA                // Restore match region, in case look-ahead was using
606        //                                        an alternate (transparent) region.
607        {
608            fixLiterals();
609            int32_t dataLoc = allocateData(2);
610            appendOp(URX_LA_START, dataLoc);
611            appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
612            appendOp(URX_NOP, 0);
613
614            // On the Parentheses stack, start a new frame and add the postions
615            //   of the StateSave and NOP.
616            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
617            fParenStack.push(negLookAhead, *fStatus);                    // Frame type
618            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
619            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
620
621            // Instructions #5 - #7 will be added when the ')' is encountered.
622        }
623        break;
624
625    case doOpenLookBehind:
626        {
627            //   Compile a (?<= look-behind open paren.
628            //
629            //          Compiles to
630            //              0       URX_LB_START     dataLoc
631            //              1       URX_LB_CONT      dataLoc
632            //              2                        MinMatchLen
633            //              3                        MaxMatchLen
634            //              4       URX_NOP          Standard '(' boilerplate.
635            //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
636            //              6         <code for LookBehind expression>
637            //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
638            //              8       URX_LA_END       dataLoc    # Restore stack, input pos
639            //
640            //          Allocate a block of matcher data, to contain (when running a match)
641            //              0:    Stack ptr on entry
642            //              1:    Input Index on entry
643            //              2:    Start index of match current match attempt.
644            //              3:    Original Input String len.
645
646            // Generate match code for any pending literals.
647            fixLiterals();
648
649            // Allocate data space
650            int32_t dataLoc = allocateData(4);
651
652            // Emit URX_LB_START
653            appendOp(URX_LB_START, dataLoc);
654
655            // Emit URX_LB_CONT
656            appendOp(URX_LB_CONT, dataLoc);
657            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
658            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
659
660            // Emit the NOPs
661            appendOp(URX_NOP, 0);
662            appendOp(URX_NOP, 0);
663
664            // On the Parentheses stack, start a new frame and add the postions
665            //   of the URX_LB_CONT and the NOP.
666            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
667            fParenStack.push(lookBehind, *fStatus);                       // Frame type
668            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
669            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
670
671            // The final two instructions will be added when the ')' is encountered.
672        }
673
674        break;
675
676    case doOpenLookBehindNeg:
677        {
678            //   Compile a (?<! negated look-behind open paren.
679            //
680            //          Compiles to
681            //              0       URX_LB_START     dataLoc    # Save entry stack, input len
682            //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
683            //              2                        MinMatchLen
684            //              3                        MaxMatchLen
685            //              4                        continueLoc (9)
686            //              5       URX_NOP          Standard '(' boilerplate.
687            //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
688            //              7         <code for LookBehind expression>
689            //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
690            //              9       ...
691            //
692            //          Allocate a block of matcher data, to contain (when running a match)
693            //              0:    Stack ptr on entry
694            //              1:    Input Index on entry
695            //              2:    Start index of match current match attempt.
696            //              3:    Original Input String len.
697
698            // Generate match code for any pending literals.
699            fixLiterals();
700
701            // Allocate data space
702            int32_t dataLoc = allocateData(4);
703
704            // Emit URX_LB_START
705            appendOp(URX_LB_START, dataLoc);
706
707            // Emit URX_LBN_CONT
708            appendOp(URX_LBN_CONT, dataLoc);
709            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
710            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
711            appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
712
713            // Emit the NOPs
714            appendOp(URX_NOP, 0);
715            appendOp(URX_NOP, 0);
716
717            // On the Parentheses stack, start a new frame and add the postions
718            //   of the URX_LB_CONT and the NOP.
719            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
720            fParenStack.push(lookBehindN, *fStatus);                      // Frame type
721            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
722            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
723
724            // The final two instructions will be added when the ')' is encountered.
725        }
726        break;
727
728    case doConditionalExpr:
729        // Conditionals such as (?(1)a:b)
730    case doPerlInline:
731        // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
732        error(U_REGEX_UNIMPLEMENTED);
733        break;
734
735
736    case doCloseParen:
737        handleCloseParen();
738        if (fParenStack.size() <= 0) {
739            //  Extra close paren, or missing open paren.
740            error(U_REGEX_MISMATCHED_PAREN);
741        }
742        break;
743
744    case doNOP:
745        break;
746
747
748    case doBadOpenParenType:
749    case doRuleError:
750        error(U_REGEX_RULE_SYNTAX);
751        break;
752
753
754    case doMismatchedParenErr:
755        error(U_REGEX_MISMATCHED_PAREN);
756        break;
757
758    case doPlus:
759        //  Normal '+'  compiles to
760        //     1.   stuff to be repeated  (already built)
761        //     2.   jmp-sav 1
762        //     3.   ...
763        //
764        //  Or, if the item to be repeated can match a zero length string,
765        //     1.   STO_INP_LOC  data-loc
766        //     2.      body of stuff to be repeated
767        //     3.   JMP_SAV_X    2
768        //     4.   ...
769
770        //
771        //  Or, if the item to be repeated is simple
772        //     1.   Item to be repeated.
773        //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
774        //     3.   LOOP_C       stack location
775        {
776            int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
777            int32_t  frameLoc;
778
779            // Check for simple constructs, which may get special optimized code.
780            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
781                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
782
783                if (URX_TYPE(repeatedOp) == URX_SETREF) {
784                    // Emit optimized code for [char set]+
785                    appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
786                    frameLoc = allocateStackData(1);
787                    appendOp(URX_LOOP_C, frameLoc);
788                    break;
789                }
790
791                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
792                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
793                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
794                    // Emit Optimized code for .+ operations.
795                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
796                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
797                        // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
798                        loopOpI |= 1;
799                    }
800                    if (fModeFlags & UREGEX_UNIX_LINES) {
801                        loopOpI |= 2;
802                    }
803                    appendOp(loopOpI);
804                    frameLoc = allocateStackData(1);
805                    appendOp(URX_LOOP_C, frameLoc);
806                    break;
807                }
808
809            }
810
811            // General case.
812
813            // Check for minimum match length of zero, which requires
814            //    extra loop-breaking code.
815            if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
816                // Zero length match is possible.
817                // Emit the code sequence that can handle it.
818                insertOp(topLoc);
819                frameLoc = allocateStackData(1);
820
821                int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
822                fRXPat->fCompiledPat->setElementAt(op, topLoc);
823
824                appendOp(URX_JMP_SAV_X, topLoc+1);
825            } else {
826                // Simpler code when the repeated body must match something non-empty
827                appendOp(URX_JMP_SAV, topLoc);
828            }
829        }
830        break;
831
832    case doNGPlus:
833        //  Non-greedy '+?'  compiles to
834        //     1.   stuff to be repeated  (already built)
835        //     2.   state-save  1
836        //     3.   ...
837        {
838            int32_t topLoc      = blockTopLoc(FALSE);
839            appendOp(URX_STATE_SAVE, topLoc);
840        }
841        break;
842
843
844    case doOpt:
845        // Normal (greedy) ? quantifier.
846        //  Compiles to
847        //     1. state save 3
848        //     2.    body of optional block
849        //     3. ...
850        // Insert the state save into the compiled pattern, and we're done.
851        {
852            int32_t   saveStateLoc = blockTopLoc(TRUE);
853            int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
854            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
855        }
856        break;
857
858    case doNGOpt:
859        // Non-greedy ?? quantifier
860        //   compiles to
861        //    1.  jmp   4
862        //    2.     body of optional block
863        //    3   jmp   5
864        //    4.  state save 2
865        //    5    ...
866        //  This code is less than ideal, with two jmps instead of one, because we can only
867        //  insert one instruction at the top of the block being iterated.
868        {
869            int32_t  jmp1_loc = blockTopLoc(TRUE);
870            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
871
872            int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
873            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
874
875            appendOp(URX_JMP, jmp2_loc+2);
876
877            appendOp(URX_STATE_SAVE, jmp1_loc+1);
878        }
879        break;
880
881
882    case doStar:
883        // Normal (greedy) * quantifier.
884        // Compiles to
885        //       1.   STATE_SAVE   4
886        //       2.      body of stuff being iterated over
887        //       3.   JMP_SAV      2
888        //       4.   ...
889        //
890        // Or, if the body is a simple [Set],
891        //       1.   LOOP_SR_I    set number
892        //       2.   LOOP_C       stack location
893        //       ...
894        //
895        // Or if this is a .*
896        //       1.   LOOP_DOT_I    (. matches all mode flag)
897        //       2.   LOOP_C        stack location
898        //
899        // Or, if the body can match a zero-length string, to inhibit infinite loops,
900        //       1.   STATE_SAVE   5
901        //       2.   STO_INP_LOC  data-loc
902        //       3.      body of stuff
903        //       4.   JMP_SAV_X    2
904        //       5.   ...
905        {
906            // location of item #1, the STATE_SAVE
907            int32_t   topLoc = blockTopLoc(FALSE);
908            int32_t   dataLoc = -1;
909
910            // Check for simple *, where the construct being repeated
911            //   compiled to single opcode, and might be optimizable.
912            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
913                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
914
915                if (URX_TYPE(repeatedOp) == URX_SETREF) {
916                    // Emit optimized code for a [char set]*
917                    int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
918                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
919                    dataLoc = allocateStackData(1);
920                    appendOp(URX_LOOP_C, dataLoc);
921                    break;
922                }
923
924                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
925                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
926                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
927                    // Emit Optimized code for .* operations.
928                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
929                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
930                        // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
931                        loopOpI |= 1;
932                    }
933                    if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
934                        loopOpI |= 2;
935                    }
936                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
937                    dataLoc = allocateStackData(1);
938                    appendOp(URX_LOOP_C, dataLoc);
939                    break;
940                }
941            }
942
943            // Emit general case code for this *
944            // The optimizations did not apply.
945
946            int32_t   saveStateLoc = blockTopLoc(TRUE);
947            int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
948
949            // Check for minimum match length of zero, which requires
950            //    extra loop-breaking code.
951            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
952                insertOp(saveStateLoc);
953                dataLoc = allocateStackData(1);
954
955                int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
956                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
957                jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
958            }
959
960            // Locate the position in the compiled pattern where the match will continue
961            //   after completing the *.   (4 or 5 in the comment above)
962            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
963
964            // Put together the save state op and store it into the compiled code.
965            int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
966            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
967
968            // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
969            appendOp(jmpOp);
970        }
971        break;
972
973    case doNGStar:
974        // Non-greedy *? quantifier
975        // compiles to
976        //     1.   JMP    3
977        //     2.      body of stuff being iterated over
978        //     3.   STATE_SAVE  2
979        //     4    ...
980        {
981            int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
982            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
983            int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
984            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
985            appendOp(URX_STATE_SAVE, jmpLoc+1);
986        }
987        break;
988
989
990    case doIntervalInit:
991        // The '{' opening an interval quantifier was just scanned.
992        // Init the counter varaiables that will accumulate the values as the digits
993        //    are scanned.
994        fIntervalLow = 0;
995        fIntervalUpper = -1;
996        break;
997
998    case doIntevalLowerDigit:
999        // Scanned a digit from the lower value of an {lower,upper} interval
1000        {
1001            int32_t digitValue = u_charDigitValue(fC.fChar);
1002            U_ASSERT(digitValue >= 0);
1003            int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1004            if (val > INT32_MAX) {
1005                error(U_REGEX_NUMBER_TOO_BIG);
1006            } else {
1007                fIntervalLow = (int32_t)val;
1008            }
1009        }
1010        break;
1011
1012    case doIntervalUpperDigit:
1013        // Scanned a digit from the upper value of an {lower,upper} interval
1014        {
1015            if (fIntervalUpper < 0) {
1016                fIntervalUpper = 0;
1017            }
1018            int32_t digitValue = u_charDigitValue(fC.fChar);
1019            U_ASSERT(digitValue >= 0);
1020            int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1021            if (val > INT32_MAX) {
1022                error(U_REGEX_NUMBER_TOO_BIG);
1023            } else {
1024                fIntervalUpper = (int32_t)val;
1025            }
1026        }
1027        break;
1028
1029    case doIntervalSame:
1030        // Scanned a single value interval like {27}.  Upper = Lower.
1031        fIntervalUpper = fIntervalLow;
1032        break;
1033
1034    case doInterval:
1035        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1036        if (compileInlineInterval() == FALSE) {
1037            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1038        }
1039        break;
1040
1041    case doPossessiveInterval:
1042        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1043        {
1044            // Remember the loc for the top of the block being looped over.
1045            //   (Can not reserve a slot in the compiled pattern at this time, because
1046            //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1047            //    once per block.)
1048            int32_t topLoc = blockTopLoc(FALSE);
1049
1050            // Produce normal looping code.
1051            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1052
1053            // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1054            //  just as if the loop was inclosed in atomic parentheses.
1055
1056            // First the STO_SP before the start of the loop
1057            insertOp(topLoc);
1058
1059            int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1060            int32_t  op     = buildOp(URX_STO_SP, varLoc);
1061            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1062
1063            int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1064            U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1065            loopOp++;     // point LoopOp after the just-inserted STO_SP
1066            fRXPat->fCompiledPat->push(loopOp, *fStatus);
1067
1068            // Then the LD_SP after the end of the loop
1069            appendOp(URX_LD_SP, varLoc);
1070        }
1071
1072        break;
1073
1074    case doNGInterval:
1075        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1076        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1077        break;
1078
1079    case doIntervalError:
1080        error(U_REGEX_BAD_INTERVAL);
1081        break;
1082
1083    case doLiteralChar:
1084        // We've just scanned a "normal" character from the pattern,
1085        literalChar(fC.fChar);
1086        break;
1087
1088
1089    case doEscapedLiteralChar:
1090        // We've just scanned an backslashed escaped character with  no
1091        //   special meaning.  It represents itself.
1092        if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1093            ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1094            (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1095               error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1096             }
1097        literalChar(fC.fChar);
1098        break;
1099
1100
1101    case doDotAny:
1102        // scanned a ".",  match any single character.
1103        {
1104            fixLiterals(FALSE);
1105            if (fModeFlags & UREGEX_DOTALL) {
1106                appendOp(URX_DOTANY_ALL, 0);
1107            } else if (fModeFlags & UREGEX_UNIX_LINES) {
1108                appendOp(URX_DOTANY_UNIX, 0);
1109            } else {
1110                appendOp(URX_DOTANY, 0);
1111            }
1112        }
1113        break;
1114
1115    case doCaret:
1116        {
1117            fixLiterals(FALSE);
1118            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1119                appendOp(URX_CARET, 0);
1120            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1121                appendOp(URX_CARET_M, 0);
1122            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1123                appendOp(URX_CARET, 0);   // Only testing true start of input.
1124            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1125                appendOp(URX_CARET_M_UNIX, 0);
1126            }
1127        }
1128        break;
1129
1130    case doDollar:
1131        {
1132            fixLiterals(FALSE);
1133            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134                appendOp(URX_DOLLAR, 0);
1135            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136                appendOp(URX_DOLLAR_M, 0);
1137            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138                appendOp(URX_DOLLAR_D, 0);
1139            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140                appendOp(URX_DOLLAR_MD, 0);
1141            }
1142        }
1143        break;
1144
1145    case doBackslashA:
1146        fixLiterals(FALSE);
1147        appendOp(URX_CARET, 0);
1148        break;
1149
1150    case doBackslashB:
1151        {
1152            #if  UCONFIG_NO_BREAK_ITERATION==1
1153            if (fModeFlags & UREGEX_UWORD) {
1154                error(U_UNSUPPORTED_ERROR);
1155            }
1156            #endif
1157            fixLiterals(FALSE);
1158            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1159            appendOp(op, 1);
1160        }
1161        break;
1162
1163    case doBackslashb:
1164        {
1165            #if  UCONFIG_NO_BREAK_ITERATION==1
1166            if (fModeFlags & UREGEX_UWORD) {
1167                error(U_UNSUPPORTED_ERROR);
1168            }
1169            #endif
1170            fixLiterals(FALSE);
1171            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1172            appendOp(op, 0);
1173        }
1174        break;
1175
1176    case doBackslashD:
1177        fixLiterals(FALSE);
1178        appendOp(URX_BACKSLASH_D, 1);
1179        break;
1180
1181    case doBackslashd:
1182        fixLiterals(FALSE);
1183        appendOp(URX_BACKSLASH_D, 0);
1184        break;
1185
1186    case doBackslashG:
1187        fixLiterals(FALSE);
1188        appendOp(URX_BACKSLASH_G, 0);
1189        break;
1190
1191    case doBackslashH:
1192        fixLiterals(FALSE);
1193        appendOp(URX_BACKSLASH_H, 1);
1194        break;
1195
1196    case doBackslashh:
1197        fixLiterals(FALSE);
1198        appendOp(URX_BACKSLASH_H, 0);
1199        break;
1200
1201    case doBackslashR:
1202        fixLiterals(FALSE);
1203        appendOp(URX_BACKSLASH_R, 0);
1204        break;
1205
1206    case doBackslashS:
1207        fixLiterals(FALSE);
1208        appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1209        break;
1210
1211    case doBackslashs:
1212        fixLiterals(FALSE);
1213        appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1214        break;
1215
1216    case doBackslashV:
1217        fixLiterals(FALSE);
1218        appendOp(URX_BACKSLASH_V, 1);
1219        break;
1220
1221    case doBackslashv:
1222        fixLiterals(FALSE);
1223        appendOp(URX_BACKSLASH_V, 0);
1224        break;
1225
1226    case doBackslashW:
1227        fixLiterals(FALSE);
1228        appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1229        break;
1230
1231    case doBackslashw:
1232        fixLiterals(FALSE);
1233        appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1234        break;
1235
1236    case doBackslashX:
1237        fixLiterals(FALSE);
1238        appendOp(URX_BACKSLASH_X, 0);
1239        break;
1240
1241
1242    case doBackslashZ:
1243        fixLiterals(FALSE);
1244        appendOp(URX_DOLLAR, 0);
1245        break;
1246
1247    case doBackslashz:
1248        fixLiterals(FALSE);
1249        appendOp(URX_BACKSLASH_Z, 0);
1250        break;
1251
1252    case doEscapeError:
1253        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1254        break;
1255
1256    case doExit:
1257        fixLiterals(FALSE);
1258        returnVal = FALSE;
1259        break;
1260
1261    case doProperty:
1262        {
1263            fixLiterals(FALSE);
1264            UnicodeSet *theSet = scanProp();
1265            compileSet(theSet);
1266        }
1267        break;
1268
1269    case doNamedChar:
1270        {
1271            UChar32 c = scanNamedChar();
1272            literalChar(c);
1273        }
1274        break;
1275
1276
1277    case doBackRef:
1278        // BackReference.  Somewhat unusual in that the front-end can not completely parse
1279        //                 the regular expression, because the number of digits to be consumed
1280        //                 depends on the number of capture groups that have been defined.  So
1281        //                 we have to do it here instead.
1282        {
1283            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1284            int32_t  groupNum = 0;
1285            UChar32  c        = fC.fChar;
1286
1287            for (;;) {
1288                // Loop once per digit, for max allowed number of digits in a back reference.
1289                int32_t digit = u_charDigitValue(c);
1290                groupNum = groupNum * 10 + digit;
1291                if (groupNum >= numCaptureGroups) {
1292                    break;
1293                }
1294                c = peekCharLL();
1295                if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1296                    break;
1297                }
1298                nextCharLL();
1299            }
1300
1301            // Scan of the back reference in the source regexp is complete.  Now generate
1302            //  the compiled code for it.
1303            // Because capture groups can be forward-referenced by back-references,
1304            //  we fill the operand with the capture group number.  At the end
1305            //  of compilation, it will be changed to the variable's location.
1306            U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1307                                     //    and shouldn't enter this code path at all.
1308            fixLiterals(FALSE);
1309            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1310                appendOp(URX_BACKREF_I, groupNum);
1311            } else {
1312                appendOp(URX_BACKREF, groupNum);
1313            }
1314        }
1315        break;
1316
1317    case doBeginNamedBackRef:
1318        U_ASSERT(fCaptureName == NULL);
1319        fCaptureName = new UnicodeString;
1320        if (fCaptureName == NULL) {
1321            error(U_MEMORY_ALLOCATION_ERROR);
1322        }
1323        break;
1324
1325    case doContinueNamedBackRef:
1326        fCaptureName->append(fC.fChar);
1327        break;
1328
1329    case doCompleteNamedBackRef:
1330        {
1331        int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1332        if (groupNumber == 0) {
1333            // Group name has not been defined.
1334            //   Could be a forward reference. If we choose to support them at some
1335            //   future time, extra mechanism will be required at this point.
1336            error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1337        } else {
1338            // Given the number, handle identically to a \n numbered back reference.
1339            // See comments above, under doBackRef
1340            fixLiterals(FALSE);
1341            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1342                appendOp(URX_BACKREF_I, groupNumber);
1343            } else {
1344                appendOp(URX_BACKREF, groupNumber);
1345            }
1346        }
1347        delete fCaptureName;
1348        fCaptureName = NULL;
1349        break;
1350        }
1351
1352    case doPossessivePlus:
1353        // Possessive ++ quantifier.
1354        // Compiles to
1355        //       1.   STO_SP
1356        //       2.      body of stuff being iterated over
1357        //       3.   STATE_SAVE 5
1358        //       4.   JMP        2
1359        //       5.   LD_SP
1360        //       6.   ...
1361        //
1362        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1363        //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1364        //
1365        {
1366            // Emit the STO_SP
1367            int32_t   topLoc = blockTopLoc(TRUE);
1368            int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1369            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1370            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1371
1372            // Emit the STATE_SAVE
1373            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1374
1375            // Emit the JMP
1376            appendOp(URX_JMP, topLoc+1);
1377
1378            // Emit the LD_SP
1379            appendOp(URX_LD_SP, stoLoc);
1380        }
1381        break;
1382
1383    case doPossessiveStar:
1384        // Possessive *+ quantifier.
1385        // Compiles to
1386        //       1.   STO_SP       loc
1387        //       2.   STATE_SAVE   5
1388        //       3.      body of stuff being iterated over
1389        //       4.   JMP          2
1390        //       5.   LD_SP        loc
1391        //       6    ...
1392        // TODO:  do something to cut back the state stack each time through the loop.
1393        {
1394            // Reserve two slots at the top of the block.
1395            int32_t   topLoc = blockTopLoc(TRUE);
1396            insertOp(topLoc);
1397
1398            // emit   STO_SP     loc
1399            int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1400            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1401            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1402
1403            // Emit the SAVE_STATE   5
1404            int32_t L7 = fRXPat->fCompiledPat->size()+1;
1405            op = buildOp(URX_STATE_SAVE, L7);
1406            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1407
1408            // Append the JMP operation.
1409            appendOp(URX_JMP, topLoc+1);
1410
1411            // Emit the LD_SP       loc
1412            appendOp(URX_LD_SP, stoLoc);
1413        }
1414        break;
1415
1416    case doPossessiveOpt:
1417        // Possessive  ?+ quantifier.
1418        //  Compiles to
1419        //     1. STO_SP      loc
1420        //     2. SAVE_STATE  5
1421        //     3.    body of optional block
1422        //     4. LD_SP       loc
1423        //     5. ...
1424        //
1425        {
1426            // Reserve two slots at the top of the block.
1427            int32_t   topLoc = blockTopLoc(TRUE);
1428            insertOp(topLoc);
1429
1430            // Emit the STO_SP
1431            int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1432            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1433            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1434
1435            // Emit the SAVE_STATE
1436            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1437            op = buildOp(URX_STATE_SAVE, continueLoc);
1438            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1439
1440            // Emit the LD_SP
1441            appendOp(URX_LD_SP, stoLoc);
1442        }
1443        break;
1444
1445
1446    case doBeginMatchMode:
1447        fNewModeFlags = fModeFlags;
1448        fSetModeFlag  = TRUE;
1449        break;
1450
1451    case doMatchMode:   //  (?i)    and similar
1452        {
1453            int32_t  bit = 0;
1454            switch (fC.fChar) {
1455            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1456            case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1457            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1458            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1459            case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1460            case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1461            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1462            case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1463            default:
1464                U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
1465                                   // by the scanner.
1466            }
1467            if (fSetModeFlag) {
1468                fNewModeFlags |= bit;
1469            } else {
1470                fNewModeFlags &= ~bit;
1471            }
1472        }
1473        break;
1474
1475    case doSetMatchMode:
1476        // Emit code to match any pending literals, using the not-yet changed match mode.
1477        fixLiterals();
1478
1479        // We've got a (?i) or similar.  The match mode is being changed, but
1480        //   the change is not scoped to a parenthesized block.
1481        U_ASSERT(fNewModeFlags < 0);
1482        fModeFlags = fNewModeFlags;
1483
1484        break;
1485
1486
1487    case doMatchModeParen:
1488        // We've got a (?i: or similar.  Begin a parenthesized block, save old
1489        //   mode flags so they can be restored at the close of the block.
1490        //
1491        //   Compile to a
1492        //      - NOP, which later may be replaced by a save-state if the
1493        //         parenthesized group gets a * quantifier, followed by
1494        //      - NOP, which may later be replaced by a save-state if there
1495        //             is an '|' alternation within the parens.
1496        {
1497            fixLiterals(FALSE);
1498            appendOp(URX_NOP, 0);
1499            appendOp(URX_NOP, 0);
1500
1501            // On the Parentheses stack, start a new frame and add the postions
1502            //   of the two NOPs (a normal non-capturing () frame, except for the
1503            //   saving of the orignal mode flags.)
1504            fParenStack.push(fModeFlags, *fStatus);
1505            fParenStack.push(flags, *fStatus);                            // Frame Marker
1506            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1507            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1508
1509            // Set the current mode flags to the new values.
1510            U_ASSERT(fNewModeFlags < 0);
1511            fModeFlags = fNewModeFlags;
1512        }
1513        break;
1514
1515    case doBadModeFlag:
1516        error(U_REGEX_INVALID_FLAG);
1517        break;
1518
1519    case doSuppressComments:
1520        // We have just scanned a '(?'.  We now need to prevent the character scanner from
1521        // treating a '#' as a to-the-end-of-line comment.
1522        //   (This Perl compatibility just gets uglier and uglier to do...)
1523        fEOLComments = FALSE;
1524        break;
1525
1526
1527    case doSetAddAmp:
1528        {
1529          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1530          set->add(chAmp);
1531        }
1532        break;
1533
1534    case doSetAddDash:
1535        {
1536          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1537          set->add(chDash);
1538        }
1539        break;
1540
1541     case doSetBackslash_s:
1542        {
1543         UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1544         set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1545         break;
1546        }
1547
1548     case doSetBackslash_S:
1549        {
1550            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1551            UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1552            SSet.complement();
1553            set->addAll(SSet);
1554            break;
1555        }
1556
1557    case doSetBackslash_d:
1558        {
1559            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1560            // TODO - make a static set, ticket 6058.
1561            addCategory(set, U_GC_ND_MASK, *fStatus);
1562            break;
1563        }
1564
1565    case doSetBackslash_D:
1566        {
1567            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1568            UnicodeSet digits;
1569            // TODO - make a static set, ticket 6058.
1570            digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1571            digits.complement();
1572            set->addAll(digits);
1573            break;
1574        }
1575
1576    case doSetBackslash_h:
1577        {
1578            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1579            UnicodeSet h;
1580            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1581            h.add((UChar32)9);   // Tab
1582            set->addAll(h);
1583            break;
1584        }
1585
1586    case doSetBackslash_H:
1587        {
1588            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1589            UnicodeSet h;
1590            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1591            h.add((UChar32)9);   // Tab
1592            h.complement();
1593            set->addAll(h);
1594            break;
1595        }
1596
1597    case doSetBackslash_v:
1598        {
1599            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1600            set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1601            set->add((UChar32)0x85);
1602            set->add((UChar32)0x2028, (UChar32)0x2029);
1603            break;
1604        }
1605
1606    case doSetBackslash_V:
1607        {
1608            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1609            UnicodeSet v;
1610            v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1611            v.add((UChar32)0x85);
1612            v.add((UChar32)0x2028, (UChar32)0x2029);
1613            v.complement();
1614            set->addAll(v);
1615            break;
1616        }
1617
1618    case doSetBackslash_w:
1619        {
1620            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1621            set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1622            break;
1623        }
1624
1625    case doSetBackslash_W:
1626        {
1627            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1628            UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1629            SSet.complement();
1630            set->addAll(SSet);
1631            break;
1632        }
1633
1634    case doSetBegin:
1635        fixLiterals(FALSE);
1636        fSetStack.push(new UnicodeSet(), *fStatus);
1637        fSetOpStack.push(setStart, *fStatus);
1638        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1639            fSetOpStack.push(setCaseClose, *fStatus);
1640        }
1641        break;
1642
1643    case doSetBeginDifference1:
1644        //  We have scanned something like [[abc]-[
1645        //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1646        //  Push a Difference operator, which will cause the new set to be subtracted from what
1647        //    went before once it is created.
1648        setPushOp(setDifference1);
1649        fSetOpStack.push(setStart, *fStatus);
1650        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1651            fSetOpStack.push(setCaseClose, *fStatus);
1652        }
1653        break;
1654
1655    case doSetBeginIntersection1:
1656        //  We have scanned something like  [[abc]&[
1657        //   Need both the '&' operator and the open '[' operator.
1658        setPushOp(setIntersection1);
1659        fSetOpStack.push(setStart, *fStatus);
1660        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1661            fSetOpStack.push(setCaseClose, *fStatus);
1662        }
1663        break;
1664
1665    case doSetBeginUnion:
1666        //  We have scanned something like  [[abc][
1667        //     Need to handle the union operation explicitly [[abc] | [
1668        setPushOp(setUnion);
1669        fSetOpStack.push(setStart, *fStatus);
1670        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1671            fSetOpStack.push(setCaseClose, *fStatus);
1672        }
1673        break;
1674
1675    case doSetDifference2:
1676        // We have scanned something like [abc--
1677        //   Consider this to unambiguously be a set difference operator.
1678        setPushOp(setDifference2);
1679        break;
1680
1681    case doSetEnd:
1682        // Have encountered the ']' that closes a set.
1683        //    Force the evaluation of any pending operations within this set,
1684        //    leave the completed set on the top of the set stack.
1685        setEval(setEnd);
1686        U_ASSERT(fSetOpStack.peeki()==setStart);
1687        fSetOpStack.popi();
1688        break;
1689
1690    case doSetFinish:
1691        {
1692        // Finished a complete set expression, including all nested sets.
1693        //   The close bracket has already triggered clearing out pending set operators,
1694        //    the operator stack should be empty and the operand stack should have just
1695        //    one entry, the result set.
1696        U_ASSERT(fSetOpStack.empty());
1697        UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1698        U_ASSERT(fSetStack.empty());
1699        compileSet(theSet);
1700        break;
1701        }
1702
1703    case doSetIntersection2:
1704        // Have scanned something like [abc&&
1705        setPushOp(setIntersection2);
1706        break;
1707
1708    case doSetLiteral:
1709        // Union the just-scanned literal character into the set being built.
1710        //    This operation is the highest precedence set operation, so we can always do
1711        //    it immediately, without waiting to see what follows.  It is necessary to perform
1712        //    any pending '-' or '&' operation first, because these have the same precedence
1713        //    as union-ing in a literal'
1714        {
1715            setEval(setUnion);
1716            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1717            s->add(fC.fChar);
1718            fLastSetLiteral = fC.fChar;
1719            break;
1720        }
1721
1722    case doSetLiteralEscaped:
1723        // A back-slash escaped literal character was encountered.
1724        // Processing is the same as with setLiteral, above, with the addition of
1725        //  the optional check for errors on escaped ASCII letters.
1726        {
1727            if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1728                ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1729                 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1730                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1731            }
1732            setEval(setUnion);
1733            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1734            s->add(fC.fChar);
1735            fLastSetLiteral = fC.fChar;
1736            break;
1737        }
1738
1739        case doSetNamedChar:
1740        // Scanning a \N{UNICODE CHARACTER NAME}
1741        //  Aside from the source of the character, the processing is identical to doSetLiteral,
1742        //    above.
1743        {
1744            UChar32  c = scanNamedChar();
1745            setEval(setUnion);
1746            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1747            s->add(c);
1748            fLastSetLiteral = c;
1749            break;
1750        }
1751
1752    case doSetNamedRange:
1753        // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1754        // The left character is already in the set, and is saved in fLastSetLiteral.
1755        // The right side needs to be picked up, the scan is at the 'N'.
1756        // Lower Limit > Upper limit being an error matches both Java
1757        //        and ICU UnicodeSet behavior.
1758        {
1759            UChar32  c = scanNamedChar();
1760            if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
1761                error(U_REGEX_INVALID_RANGE);
1762            }
1763            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1764            s->add(fLastSetLiteral, c);
1765            fLastSetLiteral = c;
1766            break;
1767        }
1768
1769
1770    case  doSetNegate:
1771        // Scanned a '^' at the start of a set.
1772        // Push the negation operator onto the set op stack.
1773        // A twist for case-insensitive matching:
1774        //   the case closure operation must happen _before_ negation.
1775        //   But the case closure operation will already be on the stack if it's required.
1776        //   This requires checking for case closure, and swapping the stack order
1777        //    if it is present.
1778        {
1779            int32_t  tosOp = fSetOpStack.peeki();
1780            if (tosOp == setCaseClose) {
1781                fSetOpStack.popi();
1782                fSetOpStack.push(setNegation, *fStatus);
1783                fSetOpStack.push(setCaseClose, *fStatus);
1784            } else {
1785                fSetOpStack.push(setNegation, *fStatus);
1786            }
1787        }
1788        break;
1789
1790    case doSetNoCloseError:
1791        error(U_REGEX_MISSING_CLOSE_BRACKET);
1792        break;
1793
1794    case doSetOpError:
1795        error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1796        break;
1797
1798    case doSetPosixProp:
1799        {
1800            UnicodeSet *s = scanPosixProp();
1801            if (s != NULL) {
1802                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1803                tos->addAll(*s);
1804                delete s;
1805            }  // else error.  scanProp() reported the error status already.
1806        }
1807        break;
1808
1809    case doSetProp:
1810        //  Scanned a \p \P within [brackets].
1811        {
1812            UnicodeSet *s = scanProp();
1813            if (s != NULL) {
1814                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1815                tos->addAll(*s);
1816                delete s;
1817            }  // else error.  scanProp() reported the error status already.
1818        }
1819        break;
1820
1821
1822    case doSetRange:
1823        // We have scanned literal-literal.  Add the range to the set.
1824        // The left character is already in the set, and is saved in fLastSetLiteral.
1825        // The right side is the current character.
1826        // Lower Limit > Upper limit being an error matches both Java
1827        //        and ICU UnicodeSet behavior.
1828        {
1829        if (fLastSetLiteral > fC.fChar) {
1830            error(U_REGEX_INVALID_RANGE);
1831        }
1832        UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1833        s->add(fLastSetLiteral, fC.fChar);
1834        break;
1835        }
1836
1837    default:
1838        U_ASSERT(FALSE);
1839        error(U_REGEX_INTERNAL_ERROR);
1840        break;
1841    }
1842
1843    if (U_FAILURE(*fStatus)) {
1844        returnVal = FALSE;
1845    }
1846
1847    return returnVal;
1848}
1849
1850
1851
1852//------------------------------------------------------------------------------
1853//
1854//   literalChar           We've encountered a literal character from the pattern,
1855//                             or an escape sequence that reduces to a character.
1856//                         Add it to the string containing all literal chars/strings from
1857//                             the pattern.
1858//
1859//------------------------------------------------------------------------------
1860void RegexCompile::literalChar(UChar32 c)  {
1861    fLiteralChars.append(c);
1862}
1863
1864
1865//------------------------------------------------------------------------------
1866//
1867//    fixLiterals           When compiling something that can follow a literal
1868//                          string in a pattern, emit the code to match the
1869//                          accumulated literal string.
1870//
1871//                          Optionally, split the last char of the string off into
1872//                          a single "ONE_CHAR" operation, so that quantifiers can
1873//                          apply to that char alone.  Example:   abc*
1874//                          The * must apply to the 'c' only.
1875//
1876//------------------------------------------------------------------------------
1877void    RegexCompile::fixLiterals(UBool split) {
1878
1879    // If no literal characters have been scanned but not yet had code generated
1880    //   for them, nothing needs to be done.
1881    if (fLiteralChars.length() == 0) {
1882        return;
1883    }
1884
1885    int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1886    UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1887
1888    // Split:  We need to  ensure that the last item in the compiled pattern
1889    //     refers only to the last literal scanned in the pattern, so that
1890    //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1891    //     Split before case folding for case insensitive matches.
1892
1893    if (split) {
1894        fLiteralChars.truncate(indexOfLastCodePoint);
1895        fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1896                              //  Note that the truncated literal string may be empty, in which case
1897                              //  nothing will be emitted.
1898
1899        literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1900        fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1901        return;
1902    }
1903
1904    // If we are doing case-insensitive matching, case fold the string.  This may expand
1905    //   the string, e.g. the German sharp-s turns into "ss"
1906    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1907        fLiteralChars.foldCase();
1908        indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1909        lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1910    }
1911
1912    if (indexOfLastCodePoint == 0) {
1913        // Single character, emit a URX_ONECHAR op to match it.
1914        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1915                 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1916            appendOp(URX_ONECHAR_I, lastCodePoint);
1917        } else {
1918            appendOp(URX_ONECHAR, lastCodePoint);
1919        }
1920    } else {
1921        // Two or more chars, emit a URX_STRING to match them.
1922        if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1923            error(U_REGEX_PATTERN_TOO_BIG);
1924        }
1925        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1926            appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1927        } else {
1928            // TODO here:  add optimization to split case sensitive strings of length two
1929            //             into two single char ops, for efficiency.
1930            appendOp(URX_STRING, fRXPat->fLiteralText.length());
1931        }
1932        appendOp(URX_STRING_LEN, fLiteralChars.length());
1933
1934        // Add this string into the accumulated strings of the compiled pattern.
1935        fRXPat->fLiteralText.append(fLiteralChars);
1936    }
1937
1938    fLiteralChars.remove();
1939}
1940
1941
1942int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1943    if (U_FAILURE(*fStatus)) {
1944        return 0;
1945    }
1946    if (type < 0 || type > 255) {
1947        U_ASSERT(FALSE);
1948        error(U_REGEX_INTERNAL_ERROR);
1949        type = URX_RESERVED_OP;
1950    }
1951    if (val > 0x00ffffff) {
1952        U_ASSERT(FALSE);
1953        error(U_REGEX_INTERNAL_ERROR);
1954        val = 0;
1955    }
1956    if (val < 0) {
1957        if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1958            U_ASSERT(FALSE);
1959            error(U_REGEX_INTERNAL_ERROR);
1960            return -1;
1961        }
1962        if (URX_TYPE(val) != 0xff) {
1963            U_ASSERT(FALSE);
1964            error(U_REGEX_INTERNAL_ERROR);
1965            return -1;
1966        }
1967        type = URX_RESERVED_OP_N;
1968    }
1969    return (type << 24) | val;
1970}
1971
1972
1973//------------------------------------------------------------------------------
1974//
1975//   appendOp()             Append a new instruction onto the compiled pattern
1976//                          Includes error checking, limiting the size of the
1977//                          pattern to lengths that can be represented in the
1978//                          24 bit operand field of an instruction.
1979//
1980//------------------------------------------------------------------------------
1981void RegexCompile::appendOp(int32_t op) {
1982    if (U_FAILURE(*fStatus)) {
1983        return;
1984    }
1985    fRXPat->fCompiledPat->addElement(op, *fStatus);
1986    if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1987        error(U_REGEX_PATTERN_TOO_BIG);
1988    }
1989}
1990
1991void RegexCompile::appendOp(int32_t type, int32_t val) {
1992    appendOp(buildOp(type, val));
1993}
1994
1995
1996//------------------------------------------------------------------------------
1997//
1998//   insertOp()             Insert a slot for a new opcode into the already
1999//                          compiled pattern code.
2000//
2001//                          Fill the slot with a NOP.  Our caller will replace it
2002//                          with what they really wanted.
2003//
2004//------------------------------------------------------------------------------
2005void   RegexCompile::insertOp(int32_t where) {
2006    UVector64 *code = fRXPat->fCompiledPat;
2007    U_ASSERT(where>0 && where < code->size());
2008
2009    int32_t  nop = buildOp(URX_NOP, 0);
2010    code->insertElementAt(nop, where, *fStatus);
2011
2012    // Walk through the pattern, looking for any ops with targets that
2013    //  were moved down by the insert.  Fix them.
2014    int32_t loc;
2015    for (loc=0; loc<code->size(); loc++) {
2016        int32_t op = (int32_t)code->elementAti(loc);
2017        int32_t opType = URX_TYPE(op);
2018        int32_t opValue = URX_VAL(op);
2019        if ((opType == URX_JMP         ||
2020            opType == URX_JMPX         ||
2021            opType == URX_STATE_SAVE   ||
2022            opType == URX_CTR_LOOP     ||
2023            opType == URX_CTR_LOOP_NG  ||
2024            opType == URX_JMP_SAV      ||
2025            opType == URX_JMP_SAV_X    ||
2026            opType == URX_RELOC_OPRND)    && opValue > where) {
2027            // Target location for this opcode is after the insertion point and
2028            //   needs to be incremented to adjust for the insertion.
2029            opValue++;
2030            op = buildOp(opType, opValue);
2031            code->setElementAt(op, loc);
2032        }
2033    }
2034
2035    // Now fix up the parentheses stack.  All positive values in it are locations in
2036    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2037    for (loc=0; loc<fParenStack.size(); loc++) {
2038        int32_t x = fParenStack.elementAti(loc);
2039        U_ASSERT(x < code->size());
2040        if (x>where) {
2041            x++;
2042            fParenStack.setElementAt(x, loc);
2043        }
2044    }
2045
2046    if (fMatchCloseParen > where) {
2047        fMatchCloseParen++;
2048    }
2049    if (fMatchOpenParen > where) {
2050        fMatchOpenParen++;
2051    }
2052}
2053
2054
2055//------------------------------------------------------------------------------
2056//
2057//   allocateData()        Allocate storage in the matcher's static data area.
2058//                         Return the index for the newly allocated data.
2059//                         The storage won't actually exist until we are running a match
2060//                         operation, but the storage indexes are inserted into various
2061//                         opcodes while compiling the pattern.
2062//
2063//------------------------------------------------------------------------------
2064int32_t RegexCompile::allocateData(int32_t size) {
2065    if (U_FAILURE(*fStatus)) {
2066        return 0;
2067    }
2068    if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2069        error(U_REGEX_INTERNAL_ERROR);
2070        return 0;
2071    }
2072    int32_t dataIndex = fRXPat->fDataSize;
2073    fRXPat->fDataSize += size;
2074    if (fRXPat->fDataSize >= 0x00fffff0) {
2075        error(U_REGEX_INTERNAL_ERROR);
2076    }
2077    return dataIndex;
2078}
2079
2080
2081//------------------------------------------------------------------------------
2082//
2083//   allocateStackData()   Allocate space in the back-tracking stack frame.
2084//                         Return the index for the newly allocated data.
2085//                         The frame indexes are inserted into various
2086//                         opcodes while compiling the pattern, meaning that frame
2087//                         size must be restricted to the size that will fit
2088//                         as an operand (24 bits).
2089//
2090//------------------------------------------------------------------------------
2091int32_t RegexCompile::allocateStackData(int32_t size) {
2092    if (U_FAILURE(*fStatus)) {
2093        return 0;
2094    }
2095    if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2096        error(U_REGEX_INTERNAL_ERROR);
2097        return 0;
2098    }
2099    int32_t dataIndex = fRXPat->fFrameSize;
2100    fRXPat->fFrameSize += size;
2101    if (fRXPat->fFrameSize >= 0x00fffff0) {
2102        error(U_REGEX_PATTERN_TOO_BIG);
2103    }
2104    return dataIndex;
2105}
2106
2107
2108//------------------------------------------------------------------------------
2109//
2110//   blockTopLoc()          Find or create a location in the compiled pattern
2111//                          at the start of the operation or block that has
2112//                          just been compiled.  Needed when a quantifier (* or
2113//                          whatever) appears, and we need to add an operation
2114//                          at the start of the thing being quantified.
2115//
2116//                          (Parenthesized Blocks) have a slot with a NOP that
2117//                          is reserved for this purpose.  .* or similar don't
2118//                          and a slot needs to be added.
2119//
2120//       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2121//                                         at the returned location.
2122//                                 FALSE - just return the address,
2123//                                         do not reserve a location there.
2124//
2125//------------------------------------------------------------------------------
2126int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2127    int32_t   theLoc;
2128    fixLiterals(TRUE);  // Emit code for any pending literals.
2129                        //   If last item was a string, emit separate op for the its last char.
2130    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2131    {
2132        // The item just processed is a parenthesized block.
2133        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2134        U_ASSERT(theLoc > 0);
2135        U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2136    }
2137    else {
2138        // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2139        // No slot for STATE_SAVE was pre-reserved in the compiled code.
2140        // We need to make space now.
2141        theLoc = fRXPat->fCompiledPat->size()-1;
2142        int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2143        if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2144            // Strings take two opcode, we want the position of the first one.
2145            // We can have a string at this point if a single character case-folded to two.
2146            theLoc--;
2147        }
2148        if (reserveLoc) {
2149            int32_t  nop = buildOp(URX_NOP, 0);
2150            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2151        }
2152    }
2153    return theLoc;
2154}
2155
2156
2157
2158//------------------------------------------------------------------------------
2159//
2160//    handleCloseParen      When compiling a close paren, we need to go back
2161//                          and fix up any JMP or SAVE operations within the
2162//                          parenthesized block that need to target the end
2163//                          of the block.  The locations of these are kept on
2164//                          the paretheses stack.
2165//
2166//                          This function is called both when encountering a
2167//                          real ) and at the end of the pattern.
2168//
2169//------------------------------------------------------------------------------
2170void  RegexCompile::handleCloseParen() {
2171    int32_t   patIdx;
2172    int32_t   patOp;
2173    if (fParenStack.size() <= 0) {
2174        error(U_REGEX_MISMATCHED_PAREN);
2175        return;
2176    }
2177
2178    // Emit code for any pending literals.
2179    fixLiterals(FALSE);
2180
2181    // Fixup any operations within the just-closed parenthesized group
2182    //    that need to reference the end of the (block).
2183    //    (The first one popped from the stack is an unused slot for
2184    //     alternation (OR) state save, but applying the fixup to it does no harm.)
2185    for (;;) {
2186        patIdx = fParenStack.popi();
2187        if (patIdx < 0) {
2188            // value < 0 flags the start of the frame on the paren stack.
2189            break;
2190        }
2191        U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2192        patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2193        U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2194        patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2195        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2196        fMatchOpenParen     = patIdx;
2197    }
2198
2199    //  At the close of any parenthesized block, restore the match mode flags  to
2200    //  the value they had at the open paren.  Saved value is
2201    //  at the top of the paren stack.
2202    fModeFlags = fParenStack.popi();
2203    U_ASSERT(fModeFlags < 0);
2204
2205    // DO any additional fixups, depending on the specific kind of
2206    // parentesized grouping this is
2207
2208    switch (patIdx) {
2209    case plain:
2210    case flags:
2211        // No additional fixups required.
2212        //   (Grouping-only parentheses)
2213        break;
2214    case capturing:
2215        // Capturing Parentheses.
2216        //   Insert a End Capture op into the pattern.
2217        //   The frame offset of the variables for this cg is obtained from the
2218        //       start capture op and put it into the end-capture op.
2219        {
2220            int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2221            U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2222
2223            int32_t   frameVarLocation = URX_VAL(captureOp);
2224            appendOp(URX_END_CAPTURE, frameVarLocation);
2225        }
2226        break;
2227    case atomic:
2228        // Atomic Parenthesis.
2229        //   Insert a LD_SP operation to restore the state stack to the position
2230        //   it was when the atomic parens were entered.
2231        {
2232            int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2233            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2234            int32_t   stoLoc = URX_VAL(stoOp);
2235            appendOp(URX_LD_SP, stoLoc);
2236        }
2237        break;
2238
2239    case lookAhead:
2240        {
2241            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2242            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2243            int32_t dataLoc  = URX_VAL(startOp);
2244            appendOp(URX_LA_END, dataLoc);
2245        }
2246        break;
2247
2248    case negLookAhead:
2249        {
2250            // See comment at doOpenLookAheadNeg
2251            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2252            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2253            int32_t dataLoc  = URX_VAL(startOp);
2254            appendOp(URX_LA_END, dataLoc);
2255            appendOp(URX_BACKTRACK, 0);
2256            appendOp(URX_LA_END, dataLoc);
2257
2258            // Patch the URX_SAVE near the top of the block.
2259            // The destination of the SAVE is the final LA_END that was just added.
2260            int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2261            U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2262            int32_t dest     = fRXPat->fCompiledPat->size()-1;
2263            saveOp           = buildOp(URX_STATE_SAVE, dest);
2264            fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2265        }
2266        break;
2267
2268    case lookBehind:
2269        {
2270            // See comment at doOpenLookBehind.
2271
2272            // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2273            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2274            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2275            int32_t dataLoc  = URX_VAL(startOp);
2276            appendOp(URX_LB_END, dataLoc);
2277            appendOp(URX_LA_END, dataLoc);
2278
2279            // Determine the min and max bounds for the length of the
2280            //  string that the pattern can match.
2281            //  An unbounded upper limit is an error.
2282            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2283            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2284            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2285            if (URX_TYPE(maxML) != 0) {
2286                error(U_REGEX_LOOK_BEHIND_LIMIT);
2287                break;
2288            }
2289            if (maxML == INT32_MAX) {
2290                error(U_REGEX_LOOK_BEHIND_LIMIT);
2291                break;
2292            }
2293            U_ASSERT(minML <= maxML);
2294
2295            // Insert the min and max match len bounds into the URX_LB_CONT op that
2296            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2297            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2298            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2299
2300        }
2301        break;
2302
2303
2304
2305    case lookBehindN:
2306        {
2307            // See comment at doOpenLookBehindNeg.
2308
2309            // Append the URX_LBN_END to the compiled pattern.
2310            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2311            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2312            int32_t dataLoc  = URX_VAL(startOp);
2313            appendOp(URX_LBN_END, dataLoc);
2314
2315            // Determine the min and max bounds for the length of the
2316            //  string that the pattern can match.
2317            //  An unbounded upper limit is an error.
2318            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2319            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2320            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2321            if (URX_TYPE(maxML) != 0) {
2322                error(U_REGEX_LOOK_BEHIND_LIMIT);
2323                break;
2324            }
2325            if (maxML == INT32_MAX) {
2326                error(U_REGEX_LOOK_BEHIND_LIMIT);
2327                break;
2328            }
2329            U_ASSERT(minML <= maxML);
2330
2331            // Insert the min and max match len bounds into the URX_LB_CONT op that
2332            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2333            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2334            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2335
2336            // Insert the pattern location to continue at after a successful match
2337            //  as the last operand of the URX_LBN_CONT
2338            int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2339            fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2340        }
2341        break;
2342
2343
2344
2345    default:
2346        U_ASSERT(FALSE);
2347    }
2348
2349    // remember the next location in the compiled pattern.
2350    // The compilation of Quantifiers will look at this to see whether its looping
2351    //   over a parenthesized block or a single item
2352    fMatchCloseParen = fRXPat->fCompiledPat->size();
2353}
2354
2355
2356
2357//------------------------------------------------------------------------------
2358//
2359//   compileSet       Compile the pattern operations for a reference to a
2360//                    UnicodeSet.
2361//
2362//------------------------------------------------------------------------------
2363void        RegexCompile::compileSet(UnicodeSet *theSet)
2364{
2365    if (theSet == NULL) {
2366        return;
2367    }
2368    //  Remove any strings from the set.
2369    //  There shoudn't be any, but just in case.
2370    //     (Case Closure can add them; if we had a simple case closure avaialble that
2371    //      ignored strings, that would be better.)
2372    theSet->removeAllStrings();
2373    int32_t  setSize = theSet->size();
2374
2375    switch (setSize) {
2376    case 0:
2377        {
2378            // Set of no elements.   Always fails to match.
2379            appendOp(URX_BACKTRACK, 0);
2380            delete theSet;
2381        }
2382        break;
2383
2384    case 1:
2385        {
2386            // The set contains only a single code point.  Put it into
2387            //   the compiled pattern as a single char operation rather
2388            //   than a set, and discard the set itself.
2389            literalChar(theSet->charAt(0));
2390            delete theSet;
2391        }
2392        break;
2393
2394    default:
2395        {
2396            //  The set contains two or more chars.  (the normal case)
2397            //  Put it into the compiled pattern as a set.
2398            int32_t setNumber = fRXPat->fSets->size();
2399            fRXPat->fSets->addElement(theSet, *fStatus);
2400            appendOp(URX_SETREF, setNumber);
2401        }
2402    }
2403}
2404
2405
2406//------------------------------------------------------------------------------
2407//
2408//   compileInterval    Generate the code for a {min, max} style interval quantifier.
2409//                      Except for the specific opcodes used, the code is the same
2410//                      for all three types (greedy, non-greedy, possessive) of
2411//                      intervals.  The opcodes are supplied as parameters.
2412//                      (There are two sets of opcodes - greedy & possessive use the
2413//                      same ones, while non-greedy has it's own.)
2414//
2415//                      The code for interval loops has this form:
2416//                         0  CTR_INIT   counter loc (in stack frame)
2417//                         1             5  patt address of CTR_LOOP at bottom of block
2418//                         2             min count
2419//                         3             max count   (-1 for unbounded)
2420//                         4  ...        block to be iterated over
2421//                         5  CTR_LOOP
2422//
2423//                       In
2424//------------------------------------------------------------------------------
2425void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2426{
2427    // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2428    //   four slots in the compiled code.  Reserve them.
2429    int32_t   topOfBlock = blockTopLoc(TRUE);
2430    insertOp(topOfBlock);
2431    insertOp(topOfBlock);
2432    insertOp(topOfBlock);
2433
2434    // The operands for the CTR_INIT opcode include the index in the matcher data
2435    //   of the counter.  Allocate it now. There are two data items
2436    //        counterLoc   -->  Loop counter
2437    //               +1    -->  Input index (for breaking non-progressing loops)
2438    //                          (Only present if unbounded upper limit on loop)
2439    int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2440    int32_t   counterLoc = allocateStackData(dataSize);
2441
2442    int32_t   op = buildOp(InitOp, counterLoc);
2443    fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2444
2445    // The second operand of CTR_INIT is the location following the end of the loop.
2446    //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2447    //   compilation of something later on causes the code to grow and the target
2448    //   position to move.
2449    int32_t loopEnd = fRXPat->fCompiledPat->size();
2450    op = buildOp(URX_RELOC_OPRND, loopEnd);
2451    fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2452
2453    // Followed by the min and max counts.
2454    fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2455    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2456
2457    // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2458    //   Goes at end of the block being looped over, so just append to the code so far.
2459    appendOp(LoopOp, topOfBlock);
2460
2461    if ((fIntervalLow & 0xff000000) != 0 ||
2462        (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2463            error(U_REGEX_NUMBER_TOO_BIG);
2464        }
2465
2466    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2467        error(U_REGEX_MAX_LT_MIN);
2468    }
2469}
2470
2471
2472
2473UBool RegexCompile::compileInlineInterval() {
2474    if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2475        // Too big to inline.  Fail, which will cause looping code to be generated.
2476        //   (Upper < Lower picks up unbounded upper and errors, both.)
2477        return FALSE;
2478    }
2479
2480    int32_t   topOfBlock = blockTopLoc(FALSE);
2481    if (fIntervalUpper == 0) {
2482        // Pathological case.  Attempt no matches, as if the block doesn't exist.
2483        // Discard the generated code for the block.
2484        // If the block included parens, discard the info pertaining to them as well.
2485        fRXPat->fCompiledPat->setSize(topOfBlock);
2486        if (fMatchOpenParen >= topOfBlock) {
2487            fMatchOpenParen = -1;
2488        }
2489        if (fMatchCloseParen >= topOfBlock) {
2490            fMatchCloseParen = -1;
2491        }
2492        return TRUE;
2493    }
2494
2495    if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2496        // The thing being repeated is not a single op, but some
2497        //   more complex block.  Do it as a loop, not inlines.
2498        //   Note that things "repeated" a max of once are handled as inline, because
2499        //     the one copy of the code already generated is just fine.
2500        return FALSE;
2501    }
2502
2503    // Pick up the opcode that is to be repeated
2504    //
2505    int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2506
2507    // Compute the pattern location where the inline sequence
2508    //   will end, and set up the state save op that will be needed.
2509    //
2510    int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2511                                + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2512    int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2513    if (fIntervalLow == 0) {
2514        insertOp(topOfBlock);
2515        fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2516    }
2517
2518
2519
2520    //  Loop, emitting the op for the thing being repeated each time.
2521    //    Loop starts at 1 because one instance of the op already exists in the pattern,
2522    //    it was put there when it was originally encountered.
2523    int32_t i;
2524    for (i=1; i<fIntervalUpper; i++ ) {
2525        if (i >= fIntervalLow) {
2526            appendOp(saveOp);
2527        }
2528        appendOp(op);
2529    }
2530    return TRUE;
2531}
2532
2533
2534
2535//------------------------------------------------------------------------------
2536//
2537//   caseInsensitiveStart  given a single code point from a pattern string, determine the
2538//                         set of characters that could potentially begin a case-insensitive
2539//                         match of a string beginning with that character, using full Unicode
2540//                         case insensitive matching.
2541//
2542//          This is used in optimizing find().
2543//
2544//          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2545//          misses cases like this:
2546//             A string from the pattern begins with 'ss' (although all we know
2547//                 in this context is that it begins with 's')
2548//             The pattern could match a string beginning with a German sharp-s
2549//
2550//           To the ordinary case closure for a character c, we add all other
2551//           characters cx where the case closure of cx incudes a string form that begins
2552//           with the original character c.
2553//
2554//           This function could be made smarter. The full pattern string is available
2555//           and it would be possible to verify that the extra characters being added
2556//           to the starting set fully match, rather than having just a first-char of the
2557//           folded form match.
2558//
2559//------------------------------------------------------------------------------
2560void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2561
2562// Machine Generated below.
2563// It may need updating with new versions of Unicode.
2564// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2565// The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2566
2567// Machine Generated Data. Do not hand edit.
2568    static const UChar32 RECaseFixCodePoints[] = {
2569        0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2570        0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2571        0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2572        0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2573        0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2574
2575    static const int16_t RECaseFixStringOffsets[] = {
2576        0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2577        0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2578        0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2579        0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2580        0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2581
2582    static const int16_t RECaseFixCounts[] = {
2583        0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2584        0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2585        0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2586        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2587        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2588
2589    static const UChar RECaseFixData[] = {
2590        0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2591        0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2592        0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2593        0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2594        0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2595        0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2596        0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2597        0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2598        0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2599        0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2600        0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2601
2602// End of machine generated data.
2603
2604    if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2605        UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2606        starterChars->set(caseFoldedC, caseFoldedC);
2607
2608        int32_t i;
2609        for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2610            // Simple linear search through the sorted list of interesting code points.
2611        }
2612
2613        if (RECaseFixCodePoints[i] == c) {
2614            int32_t dataIndex = RECaseFixStringOffsets[i];
2615            int32_t numCharsToAdd = RECaseFixCounts[i];
2616            UChar32 cpToAdd = 0;
2617            for (int32_t j=0; j<numCharsToAdd; j++) {
2618                U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2619                starterChars->add(cpToAdd);
2620            }
2621        }
2622
2623        starterChars->closeOver(USET_CASE_INSENSITIVE);
2624        starterChars->removeAllStrings();
2625    } else {
2626        // Not a cased character. Just return it alone.
2627        starterChars->set(c, c);
2628    }
2629}
2630
2631
2632
2633
2634//------------------------------------------------------------------------------
2635//
2636//   matchStartType    Determine how a match can start.
2637//                     Used to optimize find() operations.
2638//
2639//                     Operation is very similar to minMatchLength().  Walk the compiled
2640//                     pattern, keeping an on-going minimum-match-length.  For any
2641//                     op where the min match coming in is zero, add that ops possible
2642//                     starting matches to the possible starts for the overall pattern.
2643//
2644//------------------------------------------------------------------------------
2645void   RegexCompile::matchStartType() {
2646    if (U_FAILURE(*fStatus)) {
2647        return;
2648    }
2649
2650
2651    int32_t    loc;                    // Location in the pattern of the current op being processed.
2652    int32_t    op;                     // The op being processed
2653    int32_t    opType;                 // The opcode type of the op
2654    int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2655    int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2656
2657    UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2658                                       //   could have advanced the position in a match.
2659                                       //   (Maximum match length so far == 0)
2660
2661    // forwardedLength is a vector holding minimum-match-length values that
2662    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2663    //   It must be one longer than the pattern being checked because some  ops
2664    //   will jmp to a end-of-block+1 location from within a block, and we must
2665    //   count those when checking the block.
2666    int32_t end = fRXPat->fCompiledPat->size();
2667    UVector32  forwardedLength(end+1, *fStatus);
2668    forwardedLength.setSize(end+1);
2669    for (loc=3; loc<end; loc++) {
2670        forwardedLength.setElementAt(INT32_MAX, loc);
2671    }
2672
2673    for (loc = 3; loc<end; loc++) {
2674        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2675        opType = URX_TYPE(op);
2676
2677        // The loop is advancing linearly through the pattern.
2678        // If the op we are now at was the destination of a branch in the pattern,
2679        // and that path has a shorter minimum length than the current accumulated value,
2680        // replace the current accumulated value.
2681        if (forwardedLength.elementAti(loc) < currentLen) {
2682            currentLen = forwardedLength.elementAti(loc);
2683            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2684        }
2685
2686        switch (opType) {
2687            // Ops that don't change the total length matched
2688        case URX_RESERVED_OP:
2689        case URX_END:
2690        case URX_FAIL:
2691        case URX_STRING_LEN:
2692        case URX_NOP:
2693        case URX_START_CAPTURE:
2694        case URX_END_CAPTURE:
2695        case URX_BACKSLASH_B:
2696        case URX_BACKSLASH_BU:
2697        case URX_BACKSLASH_G:
2698        case URX_BACKSLASH_Z:
2699        case URX_DOLLAR:
2700        case URX_DOLLAR_M:
2701        case URX_DOLLAR_D:
2702        case URX_DOLLAR_MD:
2703        case URX_RELOC_OPRND:
2704        case URX_STO_INP_LOC:
2705        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2706        case URX_BACKREF_I:
2707
2708        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2709        case URX_LD_SP:
2710            break;
2711
2712        case URX_CARET:
2713            if (atStart) {
2714                fRXPat->fStartType = START_START;
2715            }
2716            break;
2717
2718        case URX_CARET_M:
2719        case URX_CARET_M_UNIX:
2720            if (atStart) {
2721                fRXPat->fStartType = START_LINE;
2722            }
2723            break;
2724
2725        case URX_ONECHAR:
2726            if (currentLen == 0) {
2727                // This character could appear at the start of a match.
2728                //   Add it to the set of possible starting characters.
2729                fRXPat->fInitialChars->add(URX_VAL(op));
2730                numInitialStrings += 2;
2731            }
2732            currentLen++;
2733            atStart = FALSE;
2734            break;
2735
2736
2737        case URX_SETREF:
2738            if (currentLen == 0) {
2739                int32_t  sn = URX_VAL(op);
2740                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2741                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2742                fRXPat->fInitialChars->addAll(*s);
2743                numInitialStrings += 2;
2744            }
2745            currentLen++;
2746            atStart = FALSE;
2747            break;
2748
2749        case URX_LOOP_SR_I:
2750            // [Set]*, like a SETREF, above, in what it can match,
2751            //  but may not match at all, so currentLen is not incremented.
2752            if (currentLen == 0) {
2753                int32_t  sn = URX_VAL(op);
2754                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2755                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2756                fRXPat->fInitialChars->addAll(*s);
2757                numInitialStrings += 2;
2758            }
2759            atStart = FALSE;
2760            break;
2761
2762        case URX_LOOP_DOT_I:
2763            if (currentLen == 0) {
2764                // .* at the start of a pattern.
2765                //    Any character can begin the match.
2766                fRXPat->fInitialChars->clear();
2767                fRXPat->fInitialChars->complement();
2768                numInitialStrings += 2;
2769            }
2770            atStart = FALSE;
2771            break;
2772
2773
2774        case URX_STATIC_SETREF:
2775            if (currentLen == 0) {
2776                int32_t  sn = URX_VAL(op);
2777                U_ASSERT(sn>0 && sn<URX_LAST_SET);
2778                const UnicodeSet *s = fRXPat->fStaticSets[sn];
2779                fRXPat->fInitialChars->addAll(*s);
2780                numInitialStrings += 2;
2781            }
2782            currentLen++;
2783            atStart = FALSE;
2784            break;
2785
2786
2787
2788        case URX_STAT_SETREF_N:
2789            if (currentLen == 0) {
2790                int32_t  sn = URX_VAL(op);
2791                const UnicodeSet *s = fRXPat->fStaticSets[sn];
2792                UnicodeSet sc(*s);
2793                sc.complement();
2794                fRXPat->fInitialChars->addAll(sc);
2795                numInitialStrings += 2;
2796            }
2797            currentLen++;
2798            atStart = FALSE;
2799            break;
2800
2801
2802
2803        case URX_BACKSLASH_D:
2804            // Digit Char
2805             if (currentLen == 0) {
2806                 UnicodeSet s;
2807                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2808                 if (URX_VAL(op) != 0) {
2809                     s.complement();
2810                 }
2811                 fRXPat->fInitialChars->addAll(s);
2812                 numInitialStrings += 2;
2813            }
2814            currentLen++;
2815            atStart = FALSE;
2816            break;
2817
2818
2819        case URX_BACKSLASH_H:
2820            // Horiz white space
2821            if (currentLen == 0) {
2822                UnicodeSet s;
2823                s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2824                s.add((UChar32)9);   // Tab
2825                if (URX_VAL(op) != 0) {
2826                    s.complement();
2827                }
2828                fRXPat->fInitialChars->addAll(s);
2829                numInitialStrings += 2;
2830            }
2831            currentLen++;
2832            atStart = FALSE;
2833            break;
2834
2835
2836        case URX_BACKSLASH_R:       // Any line ending sequence
2837        case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2838            if (currentLen == 0) {
2839                UnicodeSet s;
2840                s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2841                s.add((UChar32)0x85);
2842                s.add((UChar32)0x2028, (UChar32)0x2029);
2843                if (URX_VAL(op) != 0) {
2844                     // Complement option applies to URX_BACKSLASH_V only.
2845                     s.complement();
2846                }
2847                fRXPat->fInitialChars->addAll(s);
2848                numInitialStrings += 2;
2849            }
2850            currentLen++;
2851            atStart = FALSE;
2852            break;
2853
2854
2855
2856        case URX_ONECHAR_I:
2857            // Case Insensitive Single Character.
2858            if (currentLen == 0) {
2859                UChar32  c = URX_VAL(op);
2860                if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2861                    UnicodeSet starters(c, c);
2862                    starters.closeOver(USET_CASE_INSENSITIVE);
2863                    // findCaseInsensitiveStarters(c, &starters);
2864                    //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2865                    //   The expanded folding can't match the pattern.
2866                    fRXPat->fInitialChars->addAll(starters);
2867                } else {
2868                    // Char has no case variants.  Just add it as-is to the
2869                    //   set of possible starting chars.
2870                    fRXPat->fInitialChars->add(c);
2871                }
2872                numInitialStrings += 2;
2873            }
2874            currentLen++;
2875            atStart = FALSE;
2876            break;
2877
2878
2879        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2880        case URX_DOTANY_ALL:    // . matches one or two.
2881        case URX_DOTANY:
2882        case URX_DOTANY_UNIX:
2883            if (currentLen == 0) {
2884                // These constructs are all bad news when they appear at the start
2885                //   of a match.  Any character can begin the match.
2886                fRXPat->fInitialChars->clear();
2887                fRXPat->fInitialChars->complement();
2888                numInitialStrings += 2;
2889            }
2890            currentLen++;
2891            atStart = FALSE;
2892            break;
2893
2894
2895        case URX_JMPX:
2896            loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2897        case URX_JMP:
2898            {
2899                int32_t  jmpDest = URX_VAL(op);
2900                if (jmpDest < loc) {
2901                    // Loop of some kind.  Can safely ignore, the worst that will happen
2902                    //  is that we understate the true minimum length
2903                    currentLen = forwardedLength.elementAti(loc+1);
2904
2905                } else {
2906                    // Forward jump.  Propagate the current min length to the target loc of the jump.
2907                    U_ASSERT(jmpDest <= end+1);
2908                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
2909                        forwardedLength.setElementAt(currentLen, jmpDest);
2910                    }
2911                }
2912            }
2913            atStart = FALSE;
2914            break;
2915
2916        case URX_JMP_SAV:
2917        case URX_JMP_SAV_X:
2918            // Combo of state save to the next loc, + jmp backwards.
2919            //   Net effect on min. length computation is nothing.
2920            atStart = FALSE;
2921            break;
2922
2923        case URX_BACKTRACK:
2924            // Fails are kind of like a branch, except that the min length was
2925            //   propagated already, by the state save.
2926            currentLen = forwardedLength.elementAti(loc+1);
2927            atStart = FALSE;
2928            break;
2929
2930
2931        case URX_STATE_SAVE:
2932            {
2933                // State Save, for forward jumps, propagate the current minimum.
2934                //             of the state save.
2935                int32_t  jmpDest = URX_VAL(op);
2936                if (jmpDest > loc) {
2937                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
2938                        forwardedLength.setElementAt(currentLen, jmpDest);
2939                    }
2940                }
2941            }
2942            atStart = FALSE;
2943            break;
2944
2945
2946
2947
2948        case URX_STRING:
2949            {
2950                loc++;
2951                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2952                int32_t stringLen   = URX_VAL(stringLenOp);
2953                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2954                U_ASSERT(stringLenOp >= 2);
2955                if (currentLen == 0) {
2956                    // Add the starting character of this string to the set of possible starting
2957                    //   characters for this pattern.
2958                    int32_t stringStartIdx = URX_VAL(op);
2959                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2960                    fRXPat->fInitialChars->add(c);
2961
2962                    // Remember this string.  After the entire pattern has been checked,
2963                    //  if nothing else is identified that can start a match, we'll use it.
2964                    numInitialStrings++;
2965                    fRXPat->fInitialStringIdx = stringStartIdx;
2966                    fRXPat->fInitialStringLen = stringLen;
2967                }
2968
2969                currentLen += stringLen;
2970                atStart = FALSE;
2971            }
2972            break;
2973
2974        case URX_STRING_I:
2975            {
2976                // Case-insensitive string.  Unlike exact-match strings, we won't
2977                //   attempt a string search for possible match positions.  But we
2978                //   do update the set of possible starting characters.
2979                loc++;
2980                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2981                int32_t stringLen   = URX_VAL(stringLenOp);
2982                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2983                U_ASSERT(stringLenOp >= 2);
2984                if (currentLen == 0) {
2985                    // Add the starting character of this string to the set of possible starting
2986                    //   characters for this pattern.
2987                    int32_t stringStartIdx = URX_VAL(op);
2988                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2989                    UnicodeSet s;
2990                    findCaseInsensitiveStarters(c, &s);
2991                    fRXPat->fInitialChars->addAll(s);
2992                    numInitialStrings += 2;  // Matching on an initial string not possible.
2993                }
2994                currentLen += stringLen;
2995                atStart = FALSE;
2996            }
2997            break;
2998
2999        case URX_CTR_INIT:
3000        case URX_CTR_INIT_NG:
3001            {
3002                // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3003                //   so location must be updated accordingly.
3004                // Loop Init Ops.
3005                //   If the min loop count == 0
3006                //      move loc forwards to the end of the loop, skipping over the body.
3007                //   If the min count is > 0,
3008                //      continue normal processing of the body of the loop.
3009                int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3010                        loopEndLoc   = URX_VAL(loopEndLoc);
3011                int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3012                if (minLoopCount == 0) {
3013                    // Min Loop Count of 0, treat like a forward branch and
3014                    //   move the current minimum length up to the target
3015                    //   (end of loop) location.
3016                    U_ASSERT(loopEndLoc <= end+1);
3017                    if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3018                        forwardedLength.setElementAt(currentLen, loopEndLoc);
3019                    }
3020                }
3021                loc+=3;  // Skips over operands of CTR_INIT
3022            }
3023            atStart = FALSE;
3024            break;
3025
3026
3027        case URX_CTR_LOOP:
3028        case URX_CTR_LOOP_NG:
3029            // Loop ops.
3030            //  The jump is conditional, backwards only.
3031            atStart = FALSE;
3032            break;
3033
3034        case URX_LOOP_C:
3035            // More loop ops.  These state-save to themselves.
3036            //   don't change the minimum match
3037            atStart = FALSE;
3038            break;
3039
3040
3041        case URX_LA_START:
3042        case URX_LB_START:
3043            {
3044                // Look-around.  Scan forward until the matching look-ahead end,
3045                //   without processing the look-around block.  This is overly pessimistic.
3046
3047                // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3048                //   lookahead contains two LA_END instructions, so count goes up by two
3049                //   for each LA_START.
3050                int32_t  depth = (opType == URX_LA_START? 2: 1);
3051                for (;;) {
3052                    loc++;
3053                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3054                    if (URX_TYPE(op) == URX_LA_START) {
3055                        depth+=2;
3056                    }
3057                    if (URX_TYPE(op) == URX_LB_START) {
3058                        depth++;
3059                    }
3060                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3061                        depth--;
3062                        if (depth == 0) {
3063                            break;
3064                        }
3065                    }
3066                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3067                        // Need this because neg lookahead blocks will FAIL to outside
3068                        //   of the block.
3069                        int32_t  jmpDest = URX_VAL(op);
3070                        if (jmpDest > loc) {
3071                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3072                                forwardedLength.setElementAt(currentLen, jmpDest);
3073                            }
3074                        }
3075                    }
3076                    U_ASSERT(loc <= end);
3077                }
3078            }
3079            break;
3080
3081        case URX_LA_END:
3082        case URX_LB_CONT:
3083        case URX_LB_END:
3084        case URX_LBN_CONT:
3085        case URX_LBN_END:
3086            U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
3087                                 //  consumed by the scan in URX_LA_START and LB_START
3088
3089            break;
3090
3091        default:
3092            U_ASSERT(FALSE);
3093            }
3094
3095        }
3096
3097
3098    // We have finished walking through the ops.  Check whether some forward jump
3099    //   propagated a shorter length to location end+1.
3100    if (forwardedLength.elementAti(end+1) < currentLen) {
3101        currentLen = forwardedLength.elementAti(end+1);
3102    }
3103
3104
3105    fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3106
3107
3108    // Sort out what we should check for when looking for candidate match start positions.
3109    // In order of preference,
3110    //     1.   Start of input text buffer.
3111    //     2.   A literal string.
3112    //     3.   Start of line in multi-line mode.
3113    //     4.   A single literal character.
3114    //     5.   A character from a set of characters.
3115    //
3116    if (fRXPat->fStartType == START_START) {
3117        // Match only at the start of an input text string.
3118        //    start type is already set.  We're done.
3119    } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3120        // Match beginning only with a literal string.
3121        UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3122        U_ASSERT(fRXPat->fInitialChars->contains(c));
3123        fRXPat->fStartType   = START_STRING;
3124        fRXPat->fInitialChar = c;
3125    } else if (fRXPat->fStartType == START_LINE) {
3126        // Match at start of line in Multi-Line mode.
3127        // Nothing to do here; everything is already set.
3128    } else if (fRXPat->fMinMatchLen == 0) {
3129        // Zero length match possible.  We could start anywhere.
3130        fRXPat->fStartType = START_NO_INFO;
3131    } else if (fRXPat->fInitialChars->size() == 1) {
3132        // All matches begin with the same char.
3133        fRXPat->fStartType   = START_CHAR;
3134        fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3135        U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3136    } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3137        fRXPat->fMinMatchLen > 0) {
3138        // Matches start with a set of character smaller than the set of all chars.
3139        fRXPat->fStartType = START_SET;
3140    } else {
3141        // Matches can start with anything
3142        fRXPat->fStartType = START_NO_INFO;
3143    }
3144
3145    return;
3146}
3147
3148
3149
3150//------------------------------------------------------------------------------
3151//
3152//   minMatchLength    Calculate the length of the shortest string that could
3153//                     match the specified pattern.
3154//                     Length is in 16 bit code units, not code points.
3155//
3156//                     The calculated length may not be exact.  The returned
3157//                     value may be shorter than the actual minimum; it must
3158//                     never be longer.
3159//
3160//                     start and end are the range of p-code operations to be
3161//                     examined.  The endpoints are included in the range.
3162//
3163//------------------------------------------------------------------------------
3164int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3165    if (U_FAILURE(*fStatus)) {
3166        return 0;
3167    }
3168
3169    U_ASSERT(start <= end);
3170    U_ASSERT(end < fRXPat->fCompiledPat->size());
3171
3172
3173    int32_t    loc;
3174    int32_t    op;
3175    int32_t    opType;
3176    int32_t    currentLen = 0;
3177
3178
3179    // forwardedLength is a vector holding minimum-match-length values that
3180    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3181    //   It must be one longer than the pattern being checked because some  ops
3182    //   will jmp to a end-of-block+1 location from within a block, and we must
3183    //   count those when checking the block.
3184    UVector32  forwardedLength(end+2, *fStatus);
3185    forwardedLength.setSize(end+2);
3186    for (loc=start; loc<=end+1; loc++) {
3187        forwardedLength.setElementAt(INT32_MAX, loc);
3188    }
3189
3190    for (loc = start; loc<=end; loc++) {
3191        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3192        opType = URX_TYPE(op);
3193
3194        // The loop is advancing linearly through the pattern.
3195        // If the op we are now at was the destination of a branch in the pattern,
3196        // and that path has a shorter minimum length than the current accumulated value,
3197        // replace the current accumulated value.
3198        // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3199                                                               //   no-match-possible cases.
3200        if (forwardedLength.elementAti(loc) < currentLen) {
3201            currentLen = forwardedLength.elementAti(loc);
3202            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3203        }
3204
3205        switch (opType) {
3206            // Ops that don't change the total length matched
3207        case URX_RESERVED_OP:
3208        case URX_END:
3209        case URX_STRING_LEN:
3210        case URX_NOP:
3211        case URX_START_CAPTURE:
3212        case URX_END_CAPTURE:
3213        case URX_BACKSLASH_B:
3214        case URX_BACKSLASH_BU:
3215        case URX_BACKSLASH_G:
3216        case URX_BACKSLASH_Z:
3217        case URX_CARET:
3218        case URX_DOLLAR:
3219        case URX_DOLLAR_M:
3220        case URX_DOLLAR_D:
3221        case URX_DOLLAR_MD:
3222        case URX_RELOC_OPRND:
3223        case URX_STO_INP_LOC:
3224        case URX_CARET_M:
3225        case URX_CARET_M_UNIX:
3226        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3227        case URX_BACKREF_I:
3228
3229        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3230        case URX_LD_SP:
3231
3232        case URX_JMP_SAV:
3233        case URX_JMP_SAV_X:
3234            break;
3235
3236
3237            // Ops that match a minimum of one character (one or two 16 bit code units.)
3238            //
3239        case URX_ONECHAR:
3240        case URX_STATIC_SETREF:
3241        case URX_STAT_SETREF_N:
3242        case URX_SETREF:
3243        case URX_BACKSLASH_D:
3244        case URX_BACKSLASH_H:
3245        case URX_BACKSLASH_R:
3246        case URX_BACKSLASH_V:
3247        case URX_ONECHAR_I:
3248        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3249        case URX_DOTANY_ALL:    // . matches one or two.
3250        case URX_DOTANY:
3251        case URX_DOTANY_UNIX:
3252            currentLen++;
3253            break;
3254
3255
3256        case URX_JMPX:
3257            loc++;              // URX_JMPX has an extra operand, ignored here,
3258                                //   otherwise processed identically to URX_JMP.
3259        case URX_JMP:
3260            {
3261                int32_t  jmpDest = URX_VAL(op);
3262                if (jmpDest < loc) {
3263                    // Loop of some kind.  Can safely ignore, the worst that will happen
3264                    //  is that we understate the true minimum length
3265                    currentLen = forwardedLength.elementAti(loc+1);
3266                } else {
3267                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3268                    U_ASSERT(jmpDest <= end+1);
3269                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
3270                        forwardedLength.setElementAt(currentLen, jmpDest);
3271                    }
3272                }
3273            }
3274            break;
3275
3276        case URX_BACKTRACK:
3277            {
3278                // Back-tracks are kind of like a branch, except that the min length was
3279                //   propagated already, by the state save.
3280                currentLen = forwardedLength.elementAti(loc+1);
3281            }
3282            break;
3283
3284
3285        case URX_STATE_SAVE:
3286            {
3287                // State Save, for forward jumps, propagate the current minimum.
3288                //             of the state save.
3289                int32_t  jmpDest = URX_VAL(op);
3290                if (jmpDest > loc) {
3291                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
3292                        forwardedLength.setElementAt(currentLen, jmpDest);
3293                    }
3294                }
3295            }
3296            break;
3297
3298
3299        case URX_STRING:
3300            {
3301                loc++;
3302                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3303                currentLen += URX_VAL(stringLenOp);
3304            }
3305            break;
3306
3307
3308        case URX_STRING_I:
3309            {
3310                loc++;
3311                // TODO: with full case folding, matching input text may be shorter than
3312                //       the string we have here.  More smarts could put some bounds on it.
3313                //       Assume a min length of one for now.  A min length of zero causes
3314                //        optimization failures for a pattern like "string"+
3315                // currentLen += URX_VAL(stringLenOp);
3316                currentLen += 1;
3317            }
3318            break;
3319
3320        case URX_CTR_INIT:
3321        case URX_CTR_INIT_NG:
3322            {
3323                // Loop Init Ops.
3324                //   If the min loop count == 0
3325                //      move loc forwards to the end of the loop, skipping over the body.
3326                //   If the min count is > 0,
3327                //      continue normal processing of the body of the loop.
3328                int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3329                        loopEndLoc   = URX_VAL(loopEndLoc);
3330                int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3331                if (minLoopCount == 0) {
3332                    loc = loopEndLoc;
3333                } else {
3334                    loc+=3;  // Skips over operands of CTR_INIT
3335                }
3336            }
3337            break;
3338
3339
3340        case URX_CTR_LOOP:
3341        case URX_CTR_LOOP_NG:
3342            // Loop ops.
3343            //  The jump is conditional, backwards only.
3344            break;
3345
3346        case URX_LOOP_SR_I:
3347        case URX_LOOP_DOT_I:
3348        case URX_LOOP_C:
3349            // More loop ops.  These state-save to themselves.
3350            //   don't change the minimum match - could match nothing at all.
3351            break;
3352
3353
3354        case URX_LA_START:
3355        case URX_LB_START:
3356            {
3357                // Look-around.  Scan forward until the matching look-ahead end,
3358                //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3359                //   it assumes that the look-ahead match might be zero-length.
3360                //   TODO:  Positive lookahead could recursively do the block, then continue
3361                //          with the longer of the block or the value coming in.  Ticket 6060
3362                int32_t  depth = (opType == URX_LA_START? 2: 1);;
3363                for (;;) {
3364                    loc++;
3365                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3366                    if (URX_TYPE(op) == URX_LA_START) {
3367                        // The boilerplate for look-ahead includes two LA_END insturctions,
3368                        //    Depth will be decremented by each one when it is seen.
3369                        depth += 2;
3370                    }
3371                    if (URX_TYPE(op) == URX_LB_START) {
3372                        depth++;
3373                    }
3374                    if (URX_TYPE(op) == URX_LA_END) {
3375                        depth--;
3376                        if (depth == 0) {
3377                            break;
3378                        }
3379                    }
3380                    if (URX_TYPE(op)==URX_LBN_END) {
3381                        depth--;
3382                        if (depth == 0) {
3383                            break;
3384                        }
3385                    }
3386                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3387                        // Need this because neg lookahead blocks will FAIL to outside
3388                        //   of the block.
3389                        int32_t  jmpDest = URX_VAL(op);
3390                        if (jmpDest > loc) {
3391                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3392                                forwardedLength.setElementAt(currentLen, jmpDest);
3393                            }
3394                        }
3395                    }
3396                    U_ASSERT(loc <= end);
3397                }
3398            }
3399            break;
3400
3401        case URX_LA_END:
3402        case URX_LB_CONT:
3403        case URX_LB_END:
3404        case URX_LBN_CONT:
3405        case URX_LBN_END:
3406            // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3407            //   range being sized, which happens when measuring size of look-behind blocks.
3408            break;
3409
3410        default:
3411            U_ASSERT(FALSE);
3412            }
3413
3414        }
3415
3416    // We have finished walking through the ops.  Check whether some forward jump
3417    //   propagated a shorter length to location end+1.
3418    if (forwardedLength.elementAti(end+1) < currentLen) {
3419        currentLen = forwardedLength.elementAti(end+1);
3420        U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3421    }
3422
3423    return currentLen;
3424}
3425
3426// Increment with overflow check.
3427// val and delta will both be positive.
3428
3429static int32_t safeIncrement(int32_t val, int32_t delta) {
3430    if (INT32_MAX - val > delta) {
3431        return val + delta;
3432    } else {
3433        return INT32_MAX;
3434    }
3435}
3436
3437
3438//------------------------------------------------------------------------------
3439//
3440//   maxMatchLength    Calculate the length of the longest string that could
3441//                     match the specified pattern.
3442//                     Length is in 16 bit code units, not code points.
3443//
3444//                     The calculated length may not be exact.  The returned
3445//                     value may be longer than the actual maximum; it must
3446//                     never be shorter.
3447//
3448//------------------------------------------------------------------------------
3449int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3450    if (U_FAILURE(*fStatus)) {
3451        return 0;
3452    }
3453    U_ASSERT(start <= end);
3454    U_ASSERT(end < fRXPat->fCompiledPat->size());
3455
3456
3457    int32_t    loc;
3458    int32_t    op;
3459    int32_t    opType;
3460    int32_t    currentLen = 0;
3461    UVector32  forwardedLength(end+1, *fStatus);
3462    forwardedLength.setSize(end+1);
3463
3464    for (loc=start; loc<=end; loc++) {
3465        forwardedLength.setElementAt(0, loc);
3466    }
3467
3468    for (loc = start; loc<=end; loc++) {
3469        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3470        opType = URX_TYPE(op);
3471
3472        // The loop is advancing linearly through the pattern.
3473        // If the op we are now at was the destination of a branch in the pattern,
3474        // and that path has a longer maximum length than the current accumulated value,
3475        // replace the current accumulated value.
3476        if (forwardedLength.elementAti(loc) > currentLen) {
3477            currentLen = forwardedLength.elementAti(loc);
3478        }
3479
3480        switch (opType) {
3481            // Ops that don't change the total length matched
3482        case URX_RESERVED_OP:
3483        case URX_END:
3484        case URX_STRING_LEN:
3485        case URX_NOP:
3486        case URX_START_CAPTURE:
3487        case URX_END_CAPTURE:
3488        case URX_BACKSLASH_B:
3489        case URX_BACKSLASH_BU:
3490        case URX_BACKSLASH_G:
3491        case URX_BACKSLASH_Z:
3492        case URX_CARET:
3493        case URX_DOLLAR:
3494        case URX_DOLLAR_M:
3495        case URX_DOLLAR_D:
3496        case URX_DOLLAR_MD:
3497        case URX_RELOC_OPRND:
3498        case URX_STO_INP_LOC:
3499        case URX_CARET_M:
3500        case URX_CARET_M_UNIX:
3501
3502        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3503        case URX_LD_SP:
3504
3505        case URX_LB_END:
3506        case URX_LB_CONT:
3507        case URX_LBN_CONT:
3508        case URX_LBN_END:
3509            break;
3510
3511
3512            // Ops that increase that cause an unbounded increase in the length
3513            //   of a matched string, or that increase it a hard to characterize way.
3514            //   Call the max length unbounded, and stop further checking.
3515        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3516        case URX_BACKREF_I:
3517        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3518            currentLen = INT32_MAX;
3519            break;
3520
3521
3522            // Ops that match a max of one character (possibly two 16 bit code units.)
3523            //
3524        case URX_STATIC_SETREF:
3525        case URX_STAT_SETREF_N:
3526        case URX_SETREF:
3527        case URX_BACKSLASH_D:
3528        case URX_BACKSLASH_H:
3529        case URX_BACKSLASH_R:
3530        case URX_BACKSLASH_V:
3531        case URX_ONECHAR_I:
3532        case URX_DOTANY_ALL:
3533        case URX_DOTANY:
3534        case URX_DOTANY_UNIX:
3535            currentLen = safeIncrement(currentLen, 2);
3536            break;
3537
3538            // Single literal character.  Increase current max length by one or two,
3539            //       depending on whether the char is in the supplementary range.
3540        case URX_ONECHAR:
3541            currentLen = safeIncrement(currentLen, 1);
3542            if (URX_VAL(op) > 0x10000) {
3543                currentLen = safeIncrement(currentLen, 1);
3544            }
3545            break;
3546
3547            // Jumps.
3548            //
3549        case URX_JMP:
3550        case URX_JMPX:
3551        case URX_JMP_SAV:
3552        case URX_JMP_SAV_X:
3553            {
3554                int32_t  jmpDest = URX_VAL(op);
3555                if (jmpDest < loc) {
3556                    // Loop of some kind.  Max match length is unbounded.
3557                    currentLen = INT32_MAX;
3558                } else {
3559                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3560                    if (forwardedLength.elementAti(jmpDest) < currentLen) {
3561                        forwardedLength.setElementAt(currentLen, jmpDest);
3562                    }
3563                    currentLen = 0;
3564                }
3565            }
3566            break;
3567
3568        case URX_BACKTRACK:
3569            // back-tracks are kind of like a branch, except that the max length was
3570            //   propagated already, by the state save.
3571            currentLen = forwardedLength.elementAti(loc+1);
3572            break;
3573
3574
3575        case URX_STATE_SAVE:
3576            {
3577                // State Save, for forward jumps, propagate the current minimum.
3578                //               of the state save.
3579                //             For backwards jumps, they create a loop, maximum
3580                //               match length is unbounded.
3581                int32_t  jmpDest = URX_VAL(op);
3582                if (jmpDest > loc) {
3583                    if (currentLen > forwardedLength.elementAti(jmpDest)) {
3584                        forwardedLength.setElementAt(currentLen, jmpDest);
3585                    }
3586                } else {
3587                    currentLen = INT32_MAX;
3588                }
3589            }
3590            break;
3591
3592
3593
3594
3595        case URX_STRING:
3596            {
3597                loc++;
3598                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3599                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3600                break;
3601            }
3602
3603        case URX_STRING_I:
3604            // TODO:  This code assumes that any user string that matches will be no longer
3605            //        than our compiled string, with case insensitive matching.
3606            //        Our compiled string has been case-folded already.
3607            //
3608            //        Any matching user string will have no more code points than our
3609            //        compiled (folded) string.  Folding may add code points, but
3610            //        not remove them.
3611            //
3612            //        There is a potential problem if a supplemental code point
3613            //        case-folds to a BMP code point.  In this case our compiled string
3614            //        could be shorter (in code units) than a matching user string.
3615            //
3616            //        At this time (Unicode 6.1) there are no such characters, and this case
3617            //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3618            //        any problematic characters are added to Unicode.
3619            //
3620            //        If this happens, we can make a set of the BMP chars that the
3621            //        troublesome supplementals fold to, scan our string, and bump the
3622            //        currentLen one extra for each that is found.
3623            //
3624            {
3625                loc++;
3626                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3627                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3628            }
3629            break;
3630
3631        case URX_CTR_INIT:
3632        case URX_CTR_INIT_NG:
3633            // For Loops, recursively call this function on the pattern for the loop body,
3634            //   then multiply the result by the maximum loop count.
3635            {
3636                int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3637                if (loopEndLoc == loc+4) {
3638                    // Loop has an empty body. No affect on max match length.
3639                    // Continue processing with code after the loop end.
3640                    loc = loopEndLoc;
3641                    break;
3642                }
3643
3644                int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3645                if (maxLoopCount == -1) {
3646                    // Unbounded Loop. No upper bound on match length.
3647                    currentLen = INT32_MAX;
3648                    break;
3649                }
3650
3651                U_ASSERT(loopEndLoc >= loc+4);
3652                int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3653                int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3654                if (updatedLen >= INT32_MAX) {
3655                    currentLen = INT32_MAX;
3656                    break;
3657                }
3658                currentLen = (int32_t)updatedLen;
3659                loc = loopEndLoc;
3660                break;
3661            }
3662
3663        case URX_CTR_LOOP:
3664        case URX_CTR_LOOP_NG:
3665            // These opcodes will be skipped over by code for URX_CRT_INIT.
3666            // We shouldn't encounter them here.
3667            U_ASSERT(FALSE);
3668            break;
3669
3670        case URX_LOOP_SR_I:
3671        case URX_LOOP_DOT_I:
3672        case URX_LOOP_C:
3673            // For anything to do with loops, make the match length unbounded.
3674            currentLen = INT32_MAX;
3675            break;
3676
3677
3678
3679        case URX_LA_START:
3680        case URX_LA_END:
3681            // Look-ahead.  Just ignore, treat the look-ahead block as if
3682            // it were normal pattern.  Gives a too-long match length,
3683            //  but good enough for now.
3684            break;
3685
3686            // End of look-ahead ops should always be consumed by the processing at
3687            //  the URX_LA_START op.
3688            // U_ASSERT(FALSE);
3689            // break;
3690
3691        case URX_LB_START:
3692            {
3693                // Look-behind.  Scan forward until the matching look-around end,
3694                //   without processing the look-behind block.
3695                int32_t  depth = 0;
3696                for (;;) {
3697                    loc++;
3698                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3699                    if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3700                        depth++;
3701                    }
3702                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3703                        if (depth == 0) {
3704                            break;
3705                        }
3706                        depth--;
3707                    }
3708                    U_ASSERT(loc < end);
3709                }
3710            }
3711            break;
3712
3713        default:
3714            U_ASSERT(FALSE);
3715        }
3716
3717
3718        if (currentLen == INT32_MAX) {
3719            //  The maximum length is unbounded.
3720            //  Stop further processing of the pattern.
3721            break;
3722        }
3723
3724    }
3725    return currentLen;
3726
3727}
3728
3729
3730//------------------------------------------------------------------------------
3731//
3732//   stripNOPs    Remove any NOP operations from the compiled pattern code.
3733//                Extra NOPs are inserted for some constructs during the initial
3734//                code generation to provide locations that may be patched later.
3735//                Many end up unneeded, and are removed by this function.
3736//
3737//                In order to minimize the number of passes through the pattern,
3738//                back-reference fixup is also performed here (adjusting
3739//                back-reference operands to point to the correct frame offsets).
3740//
3741//------------------------------------------------------------------------------
3742void RegexCompile::stripNOPs() {
3743
3744    if (U_FAILURE(*fStatus)) {
3745        return;
3746    }
3747
3748    int32_t    end = fRXPat->fCompiledPat->size();
3749    UVector32  deltas(end, *fStatus);
3750
3751    // Make a first pass over the code, computing the amount that things
3752    //   will be offset at each location in the original code.
3753    int32_t   loc;
3754    int32_t   d = 0;
3755    for (loc=0; loc<end; loc++) {
3756        deltas.addElement(d, *fStatus);
3757        int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3758        if (URX_TYPE(op) == URX_NOP) {
3759            d++;
3760        }
3761    }
3762
3763    UnicodeString caseStringBuffer;
3764
3765    // Make a second pass over the code, removing the NOPs by moving following
3766    //  code up, and patching operands that refer to code locations that
3767    //  are being moved.  The array of offsets from the first step is used
3768    //  to compute the new operand values.
3769    int32_t src;
3770    int32_t dst = 0;
3771    for (src=0; src<end; src++) {
3772        int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3773        int32_t opType = URX_TYPE(op);
3774        switch (opType) {
3775        case URX_NOP:
3776            break;
3777
3778        case URX_STATE_SAVE:
3779        case URX_JMP:
3780        case URX_CTR_LOOP:
3781        case URX_CTR_LOOP_NG:
3782        case URX_RELOC_OPRND:
3783        case URX_JMPX:
3784        case URX_JMP_SAV:
3785        case URX_JMP_SAV_X:
3786            // These are instructions with operands that refer to code locations.
3787            {
3788                int32_t  operandAddress = URX_VAL(op);
3789                U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3790                int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3791                op = buildOp(opType, fixedOperandAddress);
3792                fRXPat->fCompiledPat->setElementAt(op, dst);
3793                dst++;
3794                break;
3795            }
3796
3797        case URX_BACKREF:
3798        case URX_BACKREF_I:
3799            {
3800                int32_t where = URX_VAL(op);
3801                if (where > fRXPat->fGroupMap->size()) {
3802                    error(U_REGEX_INVALID_BACK_REF);
3803                    break;
3804                }
3805                where = fRXPat->fGroupMap->elementAti(where-1);
3806                op    = buildOp(opType, where);
3807                fRXPat->fCompiledPat->setElementAt(op, dst);
3808                dst++;
3809
3810                fRXPat->fNeedsAltInput = TRUE;
3811                break;
3812            }
3813        case URX_RESERVED_OP:
3814        case URX_RESERVED_OP_N:
3815        case URX_BACKTRACK:
3816        case URX_END:
3817        case URX_ONECHAR:
3818        case URX_STRING:
3819        case URX_STRING_LEN:
3820        case URX_START_CAPTURE:
3821        case URX_END_CAPTURE:
3822        case URX_STATIC_SETREF:
3823        case URX_STAT_SETREF_N:
3824        case URX_SETREF:
3825        case URX_DOTANY:
3826        case URX_FAIL:
3827        case URX_BACKSLASH_B:
3828        case URX_BACKSLASH_BU:
3829        case URX_BACKSLASH_G:
3830        case URX_BACKSLASH_X:
3831        case URX_BACKSLASH_Z:
3832        case URX_DOTANY_ALL:
3833        case URX_BACKSLASH_D:
3834        case URX_CARET:
3835        case URX_DOLLAR:
3836        case URX_CTR_INIT:
3837        case URX_CTR_INIT_NG:
3838        case URX_DOTANY_UNIX:
3839        case URX_STO_SP:
3840        case URX_LD_SP:
3841        case URX_STO_INP_LOC:
3842        case URX_LA_START:
3843        case URX_LA_END:
3844        case URX_ONECHAR_I:
3845        case URX_STRING_I:
3846        case URX_DOLLAR_M:
3847        case URX_CARET_M:
3848        case URX_CARET_M_UNIX:
3849        case URX_LB_START:
3850        case URX_LB_CONT:
3851        case URX_LB_END:
3852        case URX_LBN_CONT:
3853        case URX_LBN_END:
3854        case URX_LOOP_SR_I:
3855        case URX_LOOP_DOT_I:
3856        case URX_LOOP_C:
3857        case URX_DOLLAR_D:
3858        case URX_DOLLAR_MD:
3859        case URX_BACKSLASH_H:
3860        case URX_BACKSLASH_R:
3861        case URX_BACKSLASH_V:
3862            // These instructions are unaltered by the relocation.
3863            fRXPat->fCompiledPat->setElementAt(op, dst);
3864            dst++;
3865            break;
3866
3867        default:
3868            // Some op is unaccounted for.
3869            U_ASSERT(FALSE);
3870            error(U_REGEX_INTERNAL_ERROR);
3871        }
3872    }
3873
3874    fRXPat->fCompiledPat->setSize(dst);
3875}
3876
3877
3878
3879
3880//------------------------------------------------------------------------------
3881//
3882//  Error         Report a rule parse error.
3883//                Only report it if no previous error has been recorded.
3884//
3885//------------------------------------------------------------------------------
3886void RegexCompile::error(UErrorCode e) {
3887    if (U_SUCCESS(*fStatus)) {
3888        *fStatus = e;
3889        // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3890        // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3891        // int64_t. If the values of the latter are out of range for the former,
3892        // set them to the appropriate "field not supported" values.
3893        if (fLineNum > 0x7FFFFFFF) {
3894            fParseErr->line   = 0;
3895            fParseErr->offset = -1;
3896        } else if (fCharNum > 0x7FFFFFFF) {
3897            fParseErr->line   = (int32_t)fLineNum;
3898            fParseErr->offset = -1;
3899        } else {
3900            fParseErr->line   = (int32_t)fLineNum;
3901            fParseErr->offset = (int32_t)fCharNum;
3902        }
3903
3904        UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3905
3906        // Fill in the context.
3907        //   Note: extractBetween() pins supplied indicies to the string bounds.
3908        uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3909        uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3910        utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3911        utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3912    }
3913}
3914
3915
3916//
3917//  Assorted Unicode character constants.
3918//     Numeric because there is no portable way to enter them as literals.
3919//     (Think EBCDIC).
3920//
3921static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3922static const UChar      chLF        = 0x0a;      // Line Feed
3923static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3924static const UChar      chDigit0    = 0x30;      // '0'
3925static const UChar      chDigit7    = 0x37;      // '9'
3926static const UChar      chColon     = 0x3A;      // ':'
3927static const UChar      chE         = 0x45;      // 'E'
3928static const UChar      chQ         = 0x51;      // 'Q'
3929//static const UChar      chN         = 0x4E;      // 'N'
3930static const UChar      chP         = 0x50;      // 'P'
3931static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3932//static const UChar      chLBracket  = 0x5b;      // '['
3933static const UChar      chRBracket  = 0x5d;      // ']'
3934static const UChar      chUp        = 0x5e;      // '^'
3935static const UChar      chLowerP    = 0x70;
3936static const UChar      chLBrace    = 0x7b;      // '{'
3937static const UChar      chRBrace    = 0x7d;      // '}'
3938static const UChar      chNEL       = 0x85;      //    NEL newline variant
3939static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3940
3941
3942//------------------------------------------------------------------------------
3943//
3944//  nextCharLL    Low Level Next Char from the regex pattern.
3945//                Get a char from the string, keep track of input position
3946//                     for error reporting.
3947//
3948//------------------------------------------------------------------------------
3949UChar32  RegexCompile::nextCharLL() {
3950    UChar32       ch;
3951
3952    if (fPeekChar != -1) {
3953        ch = fPeekChar;
3954        fPeekChar = -1;
3955        return ch;
3956    }
3957
3958    // assume we're already in the right place
3959    ch = UTEXT_NEXT32(fRXPat->fPattern);
3960    if (ch == U_SENTINEL) {
3961        return ch;
3962    }
3963
3964    if (ch == chCR ||
3965        ch == chNEL ||
3966        ch == chLS   ||
3967        (ch == chLF && fLastChar != chCR)) {
3968        // Character is starting a new line.  Bump up the line number, and
3969        //  reset the column to 0.
3970        fLineNum++;
3971        fCharNum=0;
3972    }
3973    else {
3974        // Character is not starting a new line.  Except in the case of a
3975        //   LF following a CR, increment the column position.
3976        if (ch != chLF) {
3977            fCharNum++;
3978        }
3979    }
3980    fLastChar = ch;
3981    return ch;
3982}
3983
3984//------------------------------------------------------------------------------
3985//
3986//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
3987//                 character without actually getting it.
3988//
3989//------------------------------------------------------------------------------
3990UChar32  RegexCompile::peekCharLL() {
3991    if (fPeekChar == -1) {
3992        fPeekChar = nextCharLL();
3993    }
3994    return fPeekChar;
3995}
3996
3997
3998//------------------------------------------------------------------------------
3999//
4000//   nextChar     for pattern scanning.  At this level, we handle stripping
4001//                out comments and processing some backslash character escapes.
4002//                The rest of the pattern grammar is handled at the next level up.
4003//
4004//------------------------------------------------------------------------------
4005void RegexCompile::nextChar(RegexPatternChar &c) {
4006
4007    fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4008    c.fChar    = nextCharLL();
4009    c.fQuoted  = FALSE;
4010
4011    if (fQuoteMode) {
4012        c.fQuoted = TRUE;
4013        if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4014            c.fChar == (UChar32)-1) {
4015            fQuoteMode = FALSE;  //  Exit quote mode,
4016            nextCharLL();        // discard the E
4017            nextChar(c);         // recurse to get the real next char
4018        }
4019    }
4020    else if (fInBackslashQuote) {
4021        // The current character immediately follows a '\'
4022        // Don't check for any further escapes, just return it as-is.
4023        // Don't set c.fQuoted, because that would prevent the state machine from
4024        //    dispatching on the character.
4025        fInBackslashQuote = FALSE;
4026    }
4027    else
4028    {
4029        // We are not in a \Q quoted region \E of the source.
4030        //
4031        if (fModeFlags & UREGEX_COMMENTS) {
4032            //
4033            // We are in free-spacing and comments mode.
4034            //  Scan through any white space and comments, until we
4035            //  reach a significant character or the end of inut.
4036            for (;;) {
4037                if (c.fChar == (UChar32)-1) {
4038                    break;     // End of Input
4039                }
4040                if  (c.fChar == chPound && fEOLComments == TRUE) {
4041                    // Start of a comment.  Consume the rest of it, until EOF or a new line
4042                    for (;;) {
4043                        c.fChar = nextCharLL();
4044                        if (c.fChar == (UChar32)-1 ||  // EOF
4045                            c.fChar == chCR        ||
4046                            c.fChar == chLF        ||
4047                            c.fChar == chNEL       ||
4048                            c.fChar == chLS)       {
4049                            break;
4050                        }
4051                    }
4052                }
4053                // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4054                if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4055                    break;
4056                }
4057                c.fChar = nextCharLL();
4058            }
4059        }
4060
4061        //
4062        //  check for backslash escaped characters.
4063        //
4064        if (c.fChar == chBackSlash) {
4065            int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4066            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4067                //
4068                // A '\' sequence that is handled by ICU's standard unescapeAt function.
4069                //   Includes \uxxxx, \n, \r, many others.
4070                //   Return the single equivalent character.
4071                //
4072                nextCharLL();                 // get & discard the peeked char.
4073                c.fQuoted = TRUE;
4074
4075                if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4076                    int32_t endIndex = (int32_t)pos;
4077                    c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4078
4079                    if (endIndex == pos) {
4080                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4081                    }
4082                    fCharNum += endIndex - pos;
4083                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4084                } else {
4085                    int32_t offset = 0;
4086                    struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4087
4088                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4089                    c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4090
4091                    if (offset == 0) {
4092                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4093                    } else if (context.lastOffset == offset) {
4094                        UTEXT_PREVIOUS32(fRXPat->fPattern);
4095                    } else if (context.lastOffset != offset-1) {
4096                        utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4097                    }
4098                    fCharNum += offset;
4099                }
4100            }
4101            else if (peekCharLL() == chDigit0) {
4102                //  Octal Escape, using Java Regexp Conventions
4103                //    which are \0 followed by 1-3 octal digits.
4104                //    Different from ICU Unescape handling of Octal, which does not
4105                //    require the leading 0.
4106                //  Java also has the convention of only consuming 2 octal digits if
4107                //    the three digit number would be > 0xff
4108                //
4109                c.fChar = 0;
4110                nextCharLL();    // Consume the initial 0.
4111                int index;
4112                for (index=0; index<3; index++) {
4113                    int32_t ch = peekCharLL();
4114                    if (ch<chDigit0 || ch>chDigit7) {
4115                        if (index==0) {
4116                           // \0 is not followed by any octal digits.
4117                           error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4118                        }
4119                        break;
4120                    }
4121                    c.fChar <<= 3;
4122                    c.fChar += ch&7;
4123                    if (c.fChar <= 255) {
4124                        nextCharLL();
4125                    } else {
4126                        // The last digit made the number too big.  Forget we saw it.
4127                        c.fChar >>= 3;
4128                    }
4129                }
4130                c.fQuoted = TRUE;
4131            }
4132            else if (peekCharLL() == chQ) {
4133                //  "\Q"  enter quote mode, which will continue until "\E"
4134                fQuoteMode = TRUE;
4135                nextCharLL();       // discard the 'Q'.
4136                nextChar(c);        // recurse to get the real next char.
4137            }
4138            else
4139            {
4140                // We are in a '\' escape that will be handled by the state table scanner.
4141                // Just return the backslash, but remember that the following char is to
4142                //  be taken literally.
4143                fInBackslashQuote = TRUE;
4144            }
4145        }
4146    }
4147
4148    // re-enable # to end-of-line comments, in case they were disabled.
4149    // They are disabled by the parser upon seeing '(?', but this lasts for
4150    //  the fetching of the next character only.
4151    fEOLComments = TRUE;
4152
4153    // putc(c.fChar, stdout);
4154}
4155
4156
4157
4158//------------------------------------------------------------------------------
4159//
4160//  scanNamedChar
4161//            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4162//
4163//             The scan position will be at the 'N'.  On return
4164//             the scan position should be just after the '}'
4165//
4166//             Return the UChar32
4167//
4168//------------------------------------------------------------------------------
4169UChar32  RegexCompile::scanNamedChar() {
4170    if (U_FAILURE(*fStatus)) {
4171        return 0;
4172    }
4173
4174    nextChar(fC);
4175    if (fC.fChar != chLBrace) {
4176        error(U_REGEX_PROPERTY_SYNTAX);
4177        return 0;
4178    }
4179
4180    UnicodeString  charName;
4181    for (;;) {
4182        nextChar(fC);
4183        if (fC.fChar == chRBrace) {
4184            break;
4185        }
4186        if (fC.fChar == -1) {
4187            error(U_REGEX_PROPERTY_SYNTAX);
4188            return 0;
4189        }
4190        charName.append(fC.fChar);
4191    }
4192
4193    char name[100];
4194    if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4195         (uint32_t)charName.length()>=sizeof(name)) {
4196        // All Unicode character names have only invariant characters.
4197        // The API to get a character, given a name, accepts only char *, forcing us to convert,
4198        //   which requires this error check
4199        error(U_REGEX_PROPERTY_SYNTAX);
4200        return 0;
4201    }
4202    charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4203
4204    UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4205    if (U_FAILURE(*fStatus)) {
4206        error(U_REGEX_PROPERTY_SYNTAX);
4207    }
4208
4209    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4210    return theChar;
4211}
4212
4213//------------------------------------------------------------------------------
4214//
4215//  scanProp   Construct a UnicodeSet from the text at the current scan
4216//             position, which will be of the form \p{whaterver}
4217//
4218//             The scan position will be at the 'p' or 'P'.  On return
4219//             the scan position should be just after the '}'
4220//
4221//             Return a UnicodeSet, constructed from the \P pattern,
4222//             or NULL if the pattern is invalid.
4223//
4224//------------------------------------------------------------------------------
4225UnicodeSet *RegexCompile::scanProp() {
4226    UnicodeSet    *uset = NULL;
4227
4228    if (U_FAILURE(*fStatus)) {
4229        return NULL;
4230    }
4231    (void)chLowerP;   // Suppress compiler unused variable warning.
4232    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4233    UBool negated = (fC.fChar == chP);
4234
4235    UnicodeString propertyName;
4236    nextChar(fC);
4237    if (fC.fChar != chLBrace) {
4238        error(U_REGEX_PROPERTY_SYNTAX);
4239        return NULL;
4240    }
4241    for (;;) {
4242        nextChar(fC);
4243        if (fC.fChar == chRBrace) {
4244            break;
4245        }
4246        if (fC.fChar == -1) {
4247            // Hit the end of the input string without finding the closing '}'
4248            error(U_REGEX_PROPERTY_SYNTAX);
4249            return NULL;
4250        }
4251        propertyName.append(fC.fChar);
4252    }
4253    uset = createSetForProperty(propertyName, negated);
4254    nextChar(fC);    // Move input scan to position following the closing '}'
4255    return uset;
4256}
4257
4258//------------------------------------------------------------------------------
4259//
4260//  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4261//             position, which is expected be of the form [:property expression:]
4262//
4263//             The scan position will be at the opening ':'.  On return
4264//             the scan position must be on the closing ']'
4265//
4266//             Return a UnicodeSet constructed from the pattern,
4267//             or NULL if this is not a valid POSIX-style set expression.
4268//             If not a property expression, restore the initial scan position
4269//                (to the opening ':')
4270//
4271//               Note:  the opening '[:' is not sufficient to guarantee that
4272//                      this is a [:property:] expression.
4273//                      [:'+=,] is a perfectly good ordinary set expression that
4274//                              happens to include ':' as one of its characters.
4275//
4276//------------------------------------------------------------------------------
4277UnicodeSet *RegexCompile::scanPosixProp() {
4278    UnicodeSet    *uset = NULL;
4279
4280    if (U_FAILURE(*fStatus)) {
4281        return NULL;
4282    }
4283
4284    U_ASSERT(fC.fChar == chColon);
4285
4286    // Save the scanner state.
4287    // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4288    int64_t     savedScanIndex        = fScanIndex;
4289    int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4290    UBool       savedQuoteMode        = fQuoteMode;
4291    UBool       savedInBackslashQuote = fInBackslashQuote;
4292    UBool       savedEOLComments      = fEOLComments;
4293    int64_t     savedLineNum          = fLineNum;
4294    int64_t     savedCharNum          = fCharNum;
4295    UChar32     savedLastChar         = fLastChar;
4296    UChar32     savedPeekChar         = fPeekChar;
4297    RegexPatternChar savedfC          = fC;
4298
4299    // Scan for a closing ].   A little tricky because there are some perverse
4300    //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4301    //   ending on the second closing ].
4302
4303    UnicodeString propName;
4304    UBool         negated  = FALSE;
4305
4306    // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4307    nextChar(fC);
4308    if (fC.fChar == chUp) {
4309       negated = TRUE;
4310       nextChar(fC);
4311    }
4312
4313    // Scan for the closing ":]", collecting the property name along the way.
4314    UBool  sawPropSetTerminator = FALSE;
4315    for (;;) {
4316        propName.append(fC.fChar);
4317        nextChar(fC);
4318        if (fC.fQuoted || fC.fChar == -1) {
4319            // Escaped characters or end of input - either says this isn't a [:Property:]
4320            break;
4321        }
4322        if (fC.fChar == chColon) {
4323            nextChar(fC);
4324            if (fC.fChar == chRBracket) {
4325                sawPropSetTerminator = TRUE;
4326            }
4327            break;
4328        }
4329    }
4330
4331    if (sawPropSetTerminator) {
4332        uset = createSetForProperty(propName, negated);
4333    }
4334    else
4335    {
4336        // No closing ":]".
4337        //  Restore the original scan position.
4338        //  The main scanner will retry the input as a normal set expression,
4339        //    not a [:Property:] expression.
4340        fScanIndex        = savedScanIndex;
4341        fQuoteMode        = savedQuoteMode;
4342        fInBackslashQuote = savedInBackslashQuote;
4343        fEOLComments      = savedEOLComments;
4344        fLineNum          = savedLineNum;
4345        fCharNum          = savedCharNum;
4346        fLastChar         = savedLastChar;
4347        fPeekChar         = savedPeekChar;
4348        fC                = savedfC;
4349        UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4350    }
4351    return uset;
4352}
4353
4354static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4355    set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4356    addCategory(set, U_GC_CF_MASK, ec);
4357}
4358
4359//
4360//  Create a Unicode Set from a Unicode Property expression.
4361//     This is common code underlying both \p{...} ane [:...:] expressions.
4362//     Includes trying the Java "properties" that aren't supported as
4363//     normal ICU UnicodeSet properties
4364//
4365static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4366static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
4367UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4368    UnicodeString   setExpr;
4369    UnicodeSet      *set;
4370    uint32_t        usetFlags = 0;
4371
4372    if (U_FAILURE(*fStatus)) {
4373        return NULL;
4374    }
4375
4376    //
4377    //  First try the property as we received it
4378    //
4379    if (negated) {
4380        setExpr.append(negSetPrefix, -1);
4381    } else {
4382        setExpr.append(posSetPrefix, -1);
4383    }
4384    setExpr.append(propName);
4385    setExpr.append(chRBrace);
4386    setExpr.append(chRBracket);
4387    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4388        usetFlags |= USET_CASE_INSENSITIVE;
4389    }
4390    set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4391    if (U_SUCCESS(*fStatus)) {
4392       return set;
4393    }
4394    delete set;
4395    set = NULL;
4396
4397    //
4398    //  The property as it was didn't work.
4399
4400    //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
4401    //     or standard Java, but many other regular expression packages do recognize it.
4402
4403    if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4404        *fStatus = U_ZERO_ERROR;
4405        set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4406        if (set == NULL) {
4407            *fStatus = U_MEMORY_ALLOCATION_ERROR;
4408            return set;
4409        }
4410        if (negated) {
4411            set->complement();
4412        }
4413        return set;
4414    }
4415
4416
4417    //    Do Java fixes -
4418    //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4419    //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4420    //
4421    //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4422    //                        is accepted by Java.  The property part of the name is compared
4423    //                        case-insenstively.  The spaces must be exactly as shown, either
4424    //                        all there, or all omitted, with exactly one at each position
4425    //                        if they are present.  From checking against JDK 1.6
4426    //
4427    //       This code should be removed when ICU properties support the Java  compatibility names
4428    //          (ICU 4.0?)
4429    //
4430    UnicodeString mPropName = propName;
4431    if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4432        mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4433    }
4434    if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4435        mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4436        mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4437    }
4438    else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4439        mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4440    }
4441
4442    //    See if the property looks like a Java "InBlockName", which
4443    //    we will recast as "Block=BlockName"
4444    //
4445    static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
4446    static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
4447    if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4448        setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
4449        setExpr.append(BLOCK, -1);
4450        setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
4451        setExpr.append(chRBrace);
4452        setExpr.append(chRBracket);
4453        *fStatus = U_ZERO_ERROR;
4454        set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4455        if (U_SUCCESS(*fStatus)) {
4456            return set;
4457        }
4458        delete set;
4459        set = NULL;
4460    }
4461
4462    if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4463        propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4464    {
4465        UErrorCode localStatus = U_ZERO_ERROR;
4466        //setExpr.remove();
4467        set = new UnicodeSet();
4468        //
4469        //  Try the various Java specific properties.
4470        //   These all begin with "java"
4471        //
4472        if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4473            addCategory(set, U_GC_CN_MASK, localStatus);
4474            set->complement();
4475        }
4476        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4477            addCategory(set, U_GC_ND_MASK, localStatus);
4478        }
4479        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4480            addIdentifierIgnorable(set, localStatus);
4481        }
4482        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4483            set->add(0, 0x1F).add(0x7F, 0x9F);
4484        }
4485        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4486            addCategory(set, U_GC_L_MASK, localStatus);
4487            addCategory(set, U_GC_SC_MASK, localStatus);
4488            addCategory(set, U_GC_PC_MASK, localStatus);
4489            addCategory(set, U_GC_ND_MASK, localStatus);
4490            addCategory(set, U_GC_NL_MASK, localStatus);
4491            addCategory(set, U_GC_MC_MASK, localStatus);
4492            addCategory(set, U_GC_MN_MASK, localStatus);
4493            addIdentifierIgnorable(set, localStatus);
4494        }
4495        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4496            addCategory(set, U_GC_L_MASK, localStatus);
4497            addCategory(set, U_GC_NL_MASK, localStatus);
4498            addCategory(set, U_GC_SC_MASK, localStatus);
4499            addCategory(set, U_GC_PC_MASK, localStatus);
4500        }
4501        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4502            addCategory(set, U_GC_L_MASK, localStatus);
4503        }
4504        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4505            addCategory(set, U_GC_L_MASK, localStatus);
4506            addCategory(set, U_GC_ND_MASK, localStatus);
4507        }
4508        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4509            addCategory(set, U_GC_LL_MASK, localStatus);
4510        }
4511        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4512            set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4513        }
4514        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4515            addCategory(set, U_GC_Z_MASK, localStatus);
4516        }
4517        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4518            set->add(0x10000, UnicodeSet::MAX_VALUE);
4519        }
4520        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4521            addCategory(set, U_GC_LT_MASK, localStatus);
4522        }
4523        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4524            addCategory(set, U_GC_L_MASK, localStatus);
4525            addCategory(set, U_GC_NL_MASK, localStatus);
4526        }
4527        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4528            addCategory(set, U_GC_L_MASK, localStatus);
4529            addCategory(set, U_GC_PC_MASK, localStatus);
4530            addCategory(set, U_GC_ND_MASK, localStatus);
4531            addCategory(set, U_GC_NL_MASK, localStatus);
4532            addCategory(set, U_GC_MC_MASK, localStatus);
4533            addCategory(set, U_GC_MN_MASK, localStatus);
4534            addIdentifierIgnorable(set, localStatus);
4535        }
4536        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4537            addCategory(set, U_GC_LU_MASK, localStatus);
4538        }
4539        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4540            set->add(0, UnicodeSet::MAX_VALUE);
4541        }
4542        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4543            addCategory(set, U_GC_Z_MASK, localStatus);
4544            set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4545            set->add(9, 0x0d).add(0x1c, 0x1f);
4546        }
4547        else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4548            set->add(0, UnicodeSet::MAX_VALUE);
4549        }
4550
4551        if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4552            *fStatus = U_ZERO_ERROR;
4553            if (usetFlags & USET_CASE_INSENSITIVE) {
4554                set->closeOver(USET_CASE_INSENSITIVE);
4555            }
4556            if (negated) {
4557                set->complement();
4558            }
4559            return set;
4560        }
4561        delete set;
4562        set = NULL;
4563    }
4564    error(*fStatus);
4565    return NULL;
4566}
4567
4568
4569
4570//
4571//  SetEval   Part of the evaluation of [set expressions].
4572//            Perform any pending (stacked) operations with precedence
4573//            equal or greater to that of the next operator encountered
4574//            in the expression.
4575//
4576void RegexCompile::setEval(int32_t nextOp) {
4577    UnicodeSet *rightOperand = NULL;
4578    UnicodeSet *leftOperand  = NULL;
4579    for (;;) {
4580        U_ASSERT(fSetOpStack.empty()==FALSE);
4581        int32_t pendingSetOperation = fSetOpStack.peeki();
4582        if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4583            break;
4584        }
4585        fSetOpStack.popi();
4586        U_ASSERT(fSetStack.empty() == FALSE);
4587        rightOperand = (UnicodeSet *)fSetStack.peek();
4588        switch (pendingSetOperation) {
4589            case setNegation:
4590                rightOperand->complement();
4591                break;
4592            case setCaseClose:
4593                // TODO: need a simple close function.  Ticket 6065
4594                rightOperand->closeOver(USET_CASE_INSENSITIVE);
4595                rightOperand->removeAllStrings();
4596                break;
4597            case setDifference1:
4598            case setDifference2:
4599                fSetStack.pop();
4600                leftOperand = (UnicodeSet *)fSetStack.peek();
4601                leftOperand->removeAll(*rightOperand);
4602                delete rightOperand;
4603                break;
4604            case setIntersection1:
4605            case setIntersection2:
4606                fSetStack.pop();
4607                leftOperand = (UnicodeSet *)fSetStack.peek();
4608                leftOperand->retainAll(*rightOperand);
4609                delete rightOperand;
4610                break;
4611            case setUnion:
4612                fSetStack.pop();
4613                leftOperand = (UnicodeSet *)fSetStack.peek();
4614                leftOperand->addAll(*rightOperand);
4615                delete rightOperand;
4616                break;
4617            default:
4618                U_ASSERT(FALSE);
4619                break;
4620            }
4621        }
4622    }
4623
4624void RegexCompile::setPushOp(int32_t op) {
4625    setEval(op);
4626    fSetOpStack.push(op, *fStatus);
4627    fSetStack.push(new UnicodeSet(), *fStatus);
4628}
4629
4630U_NAMESPACE_END
4631#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4632
4633