1//
2//  file:  regexcmp.cpp
3//
4//  Copyright (C) 2002-2014 International Business Machines Corporation and others.
5//  All Rights Reserved.
6//
7//  This file contains the ICU regular expression compiler, which is responsible
8//  for processing a regular expression pattern into the compiled form that
9//  is used by the match finding engine.
10//
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16#include "unicode/ustring.h"
17#include "unicode/unistr.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/uchriter.h"
21#include "unicode/parsepos.h"
22#include "unicode/parseerr.h"
23#include "unicode/regex.h"
24#include "unicode/utf.h"
25#include "unicode/utf16.h"
26#include "patternprops.h"
27#include "putilimp.h"
28#include "cmemory.h"
29#include "cstring.h"
30#include "uvectr32.h"
31#include "uvectr64.h"
32#include "uassert.h"
33#include "ucln_in.h"
34#include "uinvchar.h"
35
36#include "regeximp.h"
37#include "regexcst.h"   // Contains state table for the regex pattern parser.
38                        //   generated by a Perl script.
39#include "regexcmp.h"
40#include "regexst.h"
41#include "regextxt.h"
42
43
44
45U_NAMESPACE_BEGIN
46
47
48//------------------------------------------------------------------------------
49//
50//  Constructor.
51//
52//------------------------------------------------------------------------------
53RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
54   fParenStack(status), fSetStack(status), fSetOpStack(status)
55{
56    // Lazy init of all shared global sets (needed for init()'s empty text)
57    RegexStaticSets::initGlobals(&status);
58
59    fStatus           = &status;
60
61    fRXPat            = rxp;
62    fScanIndex        = 0;
63    fLastChar         = -1;
64    fPeekChar         = -1;
65    fLineNum          = 1;
66    fCharNum          = 0;
67    fQuoteMode        = FALSE;
68    fInBackslashQuote = FALSE;
69    fModeFlags        = fRXPat->fFlags | 0x80000000;
70    fEOLComments      = TRUE;
71
72    fMatchOpenParen   = -1;
73    fMatchCloseParen  = -1;
74
75    if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
76        status = rxp->fDeferredStatus;
77    }
78}
79
80static const UChar      chAmp       = 0x26;      // '&'
81static const UChar      chDash      = 0x2d;      // '-'
82
83
84//------------------------------------------------------------------------------
85//
86//  Destructor
87//
88//------------------------------------------------------------------------------
89RegexCompile::~RegexCompile() {
90}
91
92static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
93    set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
94}
95
96//------------------------------------------------------------------------------
97//
98//  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
99//                           The state tables are hand-written in the file regexcst.txt,
100//                           and converted to the form used here by a perl
101//                           script regexcst.pl
102//
103//------------------------------------------------------------------------------
104void    RegexCompile::compile(
105                         const UnicodeString &pat,   // Source pat to be compiled.
106                         UParseError &pp,            // Error position info
107                         UErrorCode &e)              // Error Code
108{
109    fRXPat->fPatternString = new UnicodeString(pat);
110    UText patternText = UTEXT_INITIALIZER;
111    utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
112
113    if (U_SUCCESS(e)) {
114        compile(&patternText, pp, e);
115        utext_close(&patternText);
116    }
117}
118
119//
120//   compile, UText mode
121//     All the work is actually done here.
122//
123void    RegexCompile::compile(
124                         UText *pat,                 // Source pat to be compiled.
125                         UParseError &pp,            // Error position info
126                         UErrorCode &e)              // Error Code
127{
128    fStatus             = &e;
129    fParseErr           = &pp;
130    fStackPtr           = 0;
131    fStack[fStackPtr]   = 0;
132
133    if (U_FAILURE(*fStatus)) {
134        return;
135    }
136
137    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
138    U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
139
140    // Prepare the RegexPattern object to receive the compiled pattern.
141    fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
142    fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
143    fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
144
145
146    // Initialize the pattern scanning state machine
147    fPatternLength = utext_nativeLength(pat);
148    uint16_t                state = 1;
149    const RegexTableEl      *tableEl;
150
151    // UREGEX_LITERAL force entire pattern to be treated as a literal string.
152    if (fModeFlags & UREGEX_LITERAL) {
153        fQuoteMode = TRUE;
154    }
155
156    nextChar(fC);                        // Fetch the first char from the pattern string.
157
158    //
159    // Main loop for the regex pattern parsing state machine.
160    //   Runs once per state transition.
161    //   Each time through optionally performs, depending on the state table,
162    //      - an advance to the the next pattern char
163    //      - an action to be performed.
164    //      - pushing or popping a state to/from the local state return stack.
165    //   file regexcst.txt is the source for the state table.  The logic behind
166    //     recongizing the pattern syntax is there, not here.
167    //
168    for (;;) {
169        //  Bail out if anything has gone wrong.
170        //  Regex pattern parsing stops on the first error encountered.
171        if (U_FAILURE(*fStatus)) {
172            break;
173        }
174
175        U_ASSERT(state != 0);
176
177        // Find the state table element that matches the input char from the pattern, or the
178        //    class of the input character.  Start with the first table row for this
179        //    state, then linearly scan forward until we find a row that matches the
180        //    character.  The last row for each state always matches all characters, so
181        //    the search will stop there, if not before.
182        //
183        tableEl = &gRuleParseStateTable[state];
184        REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
185            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
186
187        for (;;) {    // loop through table rows belonging to this state, looking for one
188                      //   that matches the current input char.
189            REGEX_SCAN_DEBUG_PRINTF(("."));
190            if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
191                // Table row specified an individual character, not a set, and
192                //   the input character is not quoted, and
193                //   the input character matched it.
194                break;
195            }
196            if (tableEl->fCharClass == 255) {
197                // Table row specified default, match anything character class.
198                break;
199            }
200            if (tableEl->fCharClass == 254 && fC.fQuoted)  {
201                // Table row specified "quoted" and the char was quoted.
202                break;
203            }
204            if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
205                // Table row specified eof and we hit eof on the input.
206                break;
207            }
208
209            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
210                fC.fQuoted == FALSE &&                                       //   char is not escaped &&
211                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
212                U_ASSERT(tableEl->fCharClass <= 137);
213                if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
214                    // Table row specified a character class, or set of characters,
215                    //   and the current char matches it.
216                    break;
217                }
218            }
219
220            // No match on this row, advance to the next  row for this state,
221            tableEl++;
222        }
223        REGEX_SCAN_DEBUG_PRINTF(("\n"));
224
225        //
226        // We've found the row of the state table that matches the current input
227        //   character from the rules string.
228        // Perform any action specified  by this row in the state table.
229        if (doParseActions(tableEl->fAction) == FALSE) {
230            // Break out of the state machine loop if the
231            //   the action signalled some kind of error, or
232            //   the action was to exit, occurs on normal end-of-rules-input.
233            break;
234        }
235
236        if (tableEl->fPushState != 0) {
237            fStackPtr++;
238            if (fStackPtr >= kStackSize) {
239                error(U_REGEX_INTERNAL_ERROR);
240                REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
241                fStackPtr--;
242            }
243            fStack[fStackPtr] = tableEl->fPushState;
244        }
245
246        //
247        //  NextChar.  This is where characters are actually fetched from the pattern.
248        //             Happens under control of the 'n' tag in the state table.
249        //
250        if (tableEl->fNextChar) {
251            nextChar(fC);
252        }
253
254        // Get the next state from the table entry, or from the
255        //   state stack if the next state was specified as "pop".
256        if (tableEl->fNextState != 255) {
257            state = tableEl->fNextState;
258        } else {
259            state = fStack[fStackPtr];
260            fStackPtr--;
261            if (fStackPtr < 0) {
262                // state stack underflow
263                // This will occur if the user pattern has mis-matched parentheses,
264                //   with extra close parens.
265                //
266                fStackPtr++;
267                error(U_REGEX_MISMATCHED_PAREN);
268            }
269        }
270
271    }
272
273    if (U_FAILURE(*fStatus)) {
274        // Bail out if the pattern had errors.
275        //   Set stack cleanup:  a successful compile would have left it empty,
276        //   but errors can leave temporary sets hanging around.
277        while (!fSetStack.empty()) {
278            delete (UnicodeSet *)fSetStack.pop();
279        }
280        return;
281    }
282
283    //
284    // The pattern has now been read and processed, and the compiled code generated.
285    //
286
287    //
288    // Compute the number of digits requried for the largest capture group number.
289    //
290    fRXPat->fMaxCaptureDigits = 1;
291    int32_t  n = 10;
292    int32_t  groupCount = fRXPat->fGroupMap->size();
293    while (n <= groupCount) {
294        fRXPat->fMaxCaptureDigits++;
295        n *= 10;
296    }
297
298    //
299    // The pattern's fFrameSize so far has accumulated the requirements for
300    //   storage for capture parentheses, counters, etc. that are encountered
301    //   in the pattern.  Add space for the two variables that are always
302    //   present in the saved state:  the input string position (int64_t) and
303    //   the position in the compiled pattern.
304    //
305    fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT;
306
307    //
308    // Optimization pass 1: NOPs, back-references, and case-folding
309    //
310    stripNOPs();
311
312    //
313    // Get bounds for the minimum and maximum length of a string that this
314    //   pattern can match.  Used to avoid looking for matches in strings that
315    //   are too short.
316    //
317    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
318
319    //
320    // Optimization pass 2: match start type
321    //
322    matchStartType();
323
324    //
325    // Set up fast latin-1 range sets
326    //
327    int32_t numSets = fRXPat->fSets->size();
328    fRXPat->fSets8 = new Regex8BitSet[numSets];
329    // Null pointer check.
330    if (fRXPat->fSets8 == NULL) {
331        e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
332        return;
333    }
334    int32_t i;
335    for (i=0; i<numSets; i++) {
336        UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
337        fRXPat->fSets8[i].init(s);
338    }
339
340}
341
342
343
344
345
346//------------------------------------------------------------------------------
347//
348//  doParseAction        Do some action during regex pattern parsing.
349//                       Called by the parse state machine.
350//
351//                       Generation of the match engine PCode happens here, or
352//                       in functions called from the parse actions defined here.
353//
354//
355//------------------------------------------------------------------------------
356UBool RegexCompile::doParseActions(int32_t action)
357{
358    UBool   returnVal = TRUE;
359
360    switch ((Regex_PatternParseAction)action) {
361
362    case doPatStart:
363        // Start of pattern compiles to:
364        //0   SAVE   2        Fall back to position of FAIL
365        //1   jmp    3
366        //2   FAIL            Stop if we ever reach here.
367        //3   NOP             Dummy, so start of pattern looks the same as
368        //                    the start of an ( grouping.
369        //4   NOP             Resreved, will be replaced by a save if there are
370        //                    OR | operators at the top level
371        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus);
372        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP,  3), *fStatus);
373        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus);
374
375        // Standard open nonCapture paren action emits the two NOPs and
376        //   sets up the paren stack frame.
377        doParseActions(doOpenNonCaptureParen);
378        break;
379
380    case doPatFinish:
381        // We've scanned to the end of the pattern
382        //  The end of pattern compiles to:
383        //        URX_END
384        //    which will stop the runtime match engine.
385        //  Encountering end of pattern also behaves like a close paren,
386        //   and forces fixups of the State Save at the beginning of the compiled pattern
387        //   and of any OR operations at the top level.
388        //
389        handleCloseParen();
390        if (fParenStack.size() > 0) {
391            // Missing close paren in pattern.
392            error(U_REGEX_MISMATCHED_PAREN);
393        }
394
395        // add the END operation to the compiled pattern.
396        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
397
398        // Terminate the pattern compilation state machine.
399        returnVal = FALSE;
400        break;
401
402
403
404    case doOrOperator:
405        // Scanning a '|', as in (A|B)
406        {
407            // Generate code for any pending literals preceding the '|'
408            fixLiterals(FALSE);
409
410            // Insert a SAVE operation at the start of the pattern section preceding
411            //   this OR at this level.  This SAVE will branch the match forward
412            //   to the right hand side of the OR in the event that the left hand
413            //   side fails to match and backtracks.  Locate the position for the
414            //   save from the location on the top of the parentheses stack.
415            int32_t savePosition = fParenStack.popi();
416            int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
417            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
418            op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
419            fRXPat->fCompiledPat->setElementAt(op, savePosition);
420
421            // Append an JMP operation into the compiled pattern.  The operand for
422            //  the JMP will eventually be the location following the ')' for the
423            //  group.  This will be patched in later, when the ')' is encountered.
424            op = URX_BUILD(URX_JMP, 0);
425            fRXPat->fCompiledPat->addElement(op, *fStatus);
426
427            // Push the position of the newly added JMP op onto the parentheses stack.
428            // This registers if for fixup when this block's close paren is encountered.
429            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
430
431            // Append a NOP to the compiled pattern.  This is the slot reserved
432            //   for a SAVE in the event that there is yet another '|' following
433            //   this one.
434            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
435            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
436        }
437        break;
438
439
440    case doOpenCaptureParen:
441        // Open Paren.
442        //   Compile to a
443        //      - NOP, which later may be replaced by a save-state if the
444        //         parenthesized group gets a * quantifier, followed by
445        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
446        //      - NOP, which may later be replaced by a save-state if there
447        //             is an '|' alternation within the parens.
448        //
449        //    Each capture group gets three slots in the save stack frame:
450        //         0: Capture Group start position (in input string being matched.)
451        //         1: Capture Group end position.
452        //         2: Start of Match-in-progress.
453        //    The first two locations are for a completed capture group, and are
454        //     referred to by back references and the like.
455        //    The third location stores the capture start position when an START_CAPTURE is
456        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
457        //      END_CAPTURE is encountered.
458        {
459            fixLiterals();
460            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
461            int32_t  varsLoc    = fRXPat->fFrameSize;    // Reserve three slots in match stack frame.
462            fRXPat->fFrameSize += 3;
463            int32_t  cop        = URX_BUILD(URX_START_CAPTURE, varsLoc);
464            fRXPat->fCompiledPat->addElement(cop, *fStatus);
465            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
466
467            // On the Parentheses stack, start a new frame and add the postions
468            //   of the two NOPs.  Depending on what follows in the pattern, the
469            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
470            //   address of the end of the parenthesized group.
471            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
472            fParenStack.push(capturing, *fStatus);                        // Frame type.
473            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
474            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
475
476            // Save the mapping from group number to stack frame variable position.
477            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
478        }
479         break;
480
481    case doOpenNonCaptureParen:
482        // Open non-caputuring (grouping only) Paren.
483        //   Compile to a
484        //      - NOP, which later may be replaced by a save-state if the
485        //         parenthesized group gets a * quantifier, followed by
486        //      - NOP, which may later be replaced by a save-state if there
487        //             is an '|' alternation within the parens.
488        {
489            fixLiterals();
490            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
491            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
492
493            // On the Parentheses stack, start a new frame and add the postions
494            //   of the two NOPs.
495            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
496            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
497            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
498            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
499        }
500         break;
501
502
503    case doOpenAtomicParen:
504        // Open Atomic Paren.  (?>
505        //   Compile to a
506        //      - NOP, which later may be replaced if the parenthesized group
507        //         has a quantifier, followed by
508        //      - STO_SP  save state stack position, so it can be restored at the ")"
509        //      - NOP, which may later be replaced by a save-state if there
510        //             is an '|' alternation within the parens.
511        {
512            fixLiterals();
513            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
514            int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the
515            fRXPat->fDataSize += 1;                    //  state stack ptr.
516            int32_t  stoOp     = URX_BUILD(URX_STO_SP, varLoc);
517            fRXPat->fCompiledPat->addElement(stoOp, *fStatus);
518            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
519
520            // On the Parentheses stack, start a new frame and add the postions
521            //   of the two NOPs.  Depending on what follows in the pattern, the
522            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
523            //   address of the end of the parenthesized group.
524            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
525            fParenStack.push(atomic, *fStatus);                           // Frame type.
526            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
527            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
528        }
529        break;
530
531
532    case doOpenLookAhead:
533        // Positive Look-ahead   (?=  stuff  )
534        //
535        //   Note:   Addition of transparent input regions, with the need to
536        //           restore the original regions when failing out of a lookahead
537        //           block, complicated this sequence.  Some conbined opcodes
538        //           might make sense - or might not, lookahead aren't that common.
539        //
540        //      Caution:  min match length optimization knows about this
541        //               sequence; don't change without making updates there too.
542        //
543        // Compiles to
544        //    1    START_LA     dataLoc     Saves SP, Input Pos
545        //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
546        //    3    JMP          6           continue ...
547        //
548        //    4.   LA_END                   Look Ahead failed.  Restore regions.
549        //    5.   BACKTRACK                and back track again.
550        //
551        //    6.   NOP              reserved for use by quantifiers on the block.
552        //                          Look-ahead can't have quantifiers, but paren stack
553        //                             compile time conventions require the slot anyhow.
554        //    7.   NOP              may be replaced if there is are '|' ops in the block.
555        //    8.     code for parenthesized stuff.
556        //    9.   LA_END
557        //
558        //  Two data slots are reserved, for saving the stack ptr and the input position.
559        {
560            fixLiterals();
561            int32_t dataLoc = fRXPat->fDataSize;
562            fRXPat->fDataSize += 2;
563            int32_t op = URX_BUILD(URX_LA_START, dataLoc);
564            fRXPat->fCompiledPat->addElement(op, *fStatus);
565
566            op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
567            fRXPat->fCompiledPat->addElement(op, *fStatus);
568
569            op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
570            fRXPat->fCompiledPat->addElement(op, *fStatus);
571
572            op = URX_BUILD(URX_LA_END, dataLoc);
573            fRXPat->fCompiledPat->addElement(op, *fStatus);
574
575            op = URX_BUILD(URX_BACKTRACK, 0);
576            fRXPat->fCompiledPat->addElement(op, *fStatus);
577
578            op = URX_BUILD(URX_NOP, 0);
579            fRXPat->fCompiledPat->addElement(op, *fStatus);
580            fRXPat->fCompiledPat->addElement(op, *fStatus);
581
582            // On the Parentheses stack, start a new frame and add the postions
583            //   of the NOPs.
584            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
585            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
586            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
587            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
588        }
589        break;
590
591    case doOpenLookAheadNeg:
592        // Negated Lookahead.   (?! stuff )
593        // Compiles to
594        //    1.    START_LA    dataloc
595        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
596        //                                //   which continues with the match.
597        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
598        //    4.       code for parenthesized stuff.
599        //    5.    END_LA                // Cut back stack, remove saved state from step 2.
600        //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
601        //    7.    END_LA                // Restore match region, in case look-ahead was using
602        //                                        an alternate (transparent) region.
603        {
604            fixLiterals();
605            int32_t dataLoc = fRXPat->fDataSize;
606            fRXPat->fDataSize += 2;
607            int32_t op = URX_BUILD(URX_LA_START, dataLoc);
608            fRXPat->fCompiledPat->addElement(op, *fStatus);
609
610            op = URX_BUILD(URX_STATE_SAVE, 0);    // dest address will be patched later.
611            fRXPat->fCompiledPat->addElement(op, *fStatus);
612
613            op = URX_BUILD(URX_NOP, 0);
614            fRXPat->fCompiledPat->addElement(op, *fStatus);
615
616            // On the Parentheses stack, start a new frame and add the postions
617            //   of the StateSave and NOP.
618            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
619            fParenStack.push(negLookAhead, *fStatus);                    // Frame type
620            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
621            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
622
623            // Instructions #5 - #7 will be added when the ')' is encountered.
624        }
625        break;
626
627    case doOpenLookBehind:
628        {
629            //   Compile a (?<= look-behind open paren.
630            //
631            //          Compiles to
632            //              0       URX_LB_START     dataLoc
633            //              1       URX_LB_CONT      dataLoc
634            //              2                        MinMatchLen
635            //              3                        MaxMatchLen
636            //              4       URX_NOP          Standard '(' boilerplate.
637            //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
638            //              6         <code for LookBehind expression>
639            //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
640            //              8       URX_LA_END       dataLoc    # Restore stack, input pos
641            //
642            //          Allocate a block of matcher data, to contain (when running a match)
643            //              0:    Stack ptr on entry
644            //              1:    Input Index on entry
645            //              2:    Start index of match current match attempt.
646            //              3:    Original Input String len.
647
648            // Generate match code for any pending literals.
649            fixLiterals();
650
651            // Allocate data space
652            int32_t dataLoc = fRXPat->fDataSize;
653            fRXPat->fDataSize += 4;
654
655            // Emit URX_LB_START
656            int32_t op = URX_BUILD(URX_LB_START, dataLoc);
657            fRXPat->fCompiledPat->addElement(op, *fStatus);
658
659            // Emit URX_LB_CONT
660            op = URX_BUILD(URX_LB_CONT, dataLoc);
661            fRXPat->fCompiledPat->addElement(op, *fStatus);
662            fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MinMatchLength.  To be filled later.
663            fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MaxMatchLength.  To be filled later.
664
665            // Emit the NOP
666            op = URX_BUILD(URX_NOP, 0);
667            fRXPat->fCompiledPat->addElement(op, *fStatus);
668            fRXPat->fCompiledPat->addElement(op, *fStatus);
669
670            // On the Parentheses stack, start a new frame and add the postions
671            //   of the URX_LB_CONT and the NOP.
672            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
673            fParenStack.push(lookBehind, *fStatus);                       // Frame type
674            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
675            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
676
677            // The final two instructions will be added when the ')' is encountered.
678        }
679
680        break;
681
682    case doOpenLookBehindNeg:
683        {
684            //   Compile a (?<! negated look-behind open paren.
685            //
686            //          Compiles to
687            //              0       URX_LB_START     dataLoc    # Save entry stack, input len
688            //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
689            //              2                        MinMatchLen
690            //              3                        MaxMatchLen
691            //              4                        continueLoc (9)
692            //              5       URX_NOP          Standard '(' boilerplate.
693            //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
694            //              7         <code for LookBehind expression>
695            //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
696            //              9       ...
697            //
698            //          Allocate a block of matcher data, to contain (when running a match)
699            //              0:    Stack ptr on entry
700            //              1:    Input Index on entry
701            //              2:    Start index of match current match attempt.
702            //              3:    Original Input String len.
703
704            // Generate match code for any pending literals.
705            fixLiterals();
706
707            // Allocate data space
708            int32_t dataLoc = fRXPat->fDataSize;
709            fRXPat->fDataSize += 4;
710
711            // Emit URX_LB_START
712            int32_t op = URX_BUILD(URX_LB_START, dataLoc);
713            fRXPat->fCompiledPat->addElement(op, *fStatus);
714
715            // Emit URX_LBN_CONT
716            op = URX_BUILD(URX_LBN_CONT, dataLoc);
717            fRXPat->fCompiledPat->addElement(op, *fStatus);
718            fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MinMatchLength.  To be filled later.
719            fRXPat->fCompiledPat->addElement(0,  *fStatus);    // MaxMatchLength.  To be filled later.
720            fRXPat->fCompiledPat->addElement(0,  *fStatus);    // Continue Loc.    To be filled later.
721
722            // Emit the NOP
723            op = URX_BUILD(URX_NOP, 0);
724            fRXPat->fCompiledPat->addElement(op, *fStatus);
725            fRXPat->fCompiledPat->addElement(op, *fStatus);
726
727            // On the Parentheses stack, start a new frame and add the postions
728            //   of the URX_LB_CONT and the NOP.
729            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
730            fParenStack.push(lookBehindN, *fStatus);                      // Frame type
731            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
732            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
733
734            // The final two instructions will be added when the ')' is encountered.
735        }
736        break;
737
738    case doConditionalExpr:
739        // Conditionals such as (?(1)a:b)
740    case doPerlInline:
741        // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
742        error(U_REGEX_UNIMPLEMENTED);
743        break;
744
745
746    case doCloseParen:
747        handleCloseParen();
748        if (fParenStack.size() <= 0) {
749            //  Extra close paren, or missing open paren.
750            error(U_REGEX_MISMATCHED_PAREN);
751        }
752        break;
753
754    case doNOP:
755        break;
756
757
758    case doBadOpenParenType:
759    case doRuleError:
760        error(U_REGEX_RULE_SYNTAX);
761        break;
762
763
764    case doMismatchedParenErr:
765        error(U_REGEX_MISMATCHED_PAREN);
766        break;
767
768    case doPlus:
769        //  Normal '+'  compiles to
770        //     1.   stuff to be repeated  (already built)
771        //     2.   jmp-sav 1
772        //     3.   ...
773        //
774        //  Or, if the item to be repeated can match a zero length string,
775        //     1.   STO_INP_LOC  data-loc
776        //     2.      body of stuff to be repeated
777        //     3.   JMP_SAV_X    2
778        //     4.   ...
779
780        //
781        //  Or, if the item to be repeated is simple
782        //     1.   Item to be repeated.
783        //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
784        //     3.   LOOP_C       stack location
785        {
786            int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
787            int32_t  frameLoc;
788
789            // Check for simple constructs, which may get special optimized code.
790            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
791                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
792
793                if (URX_TYPE(repeatedOp) == URX_SETREF) {
794                    // Emit optimized code for [char set]+
795                    int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
796                    fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
797                    frameLoc = fRXPat->fFrameSize;
798                    fRXPat->fFrameSize++;
799                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
800                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
801                    break;
802                }
803
804                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
805                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
806                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
807                    // Emit Optimized code for .+ operations.
808                    int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
809                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
810                        // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
811                        loopOpI |= 1;
812                    }
813                    if (fModeFlags & UREGEX_UNIX_LINES) {
814                        loopOpI |= 2;
815                    }
816                    fRXPat->fCompiledPat->addElement(loopOpI, *fStatus);
817                    frameLoc = fRXPat->fFrameSize;
818                    fRXPat->fFrameSize++;
819                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc);
820                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
821                    break;
822                }
823
824            }
825
826            // General case.
827
828            // Check for minimum match length of zero, which requires
829            //    extra loop-breaking code.
830            if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
831                // Zero length match is possible.
832                // Emit the code sequence that can handle it.
833                insertOp(topLoc);
834                frameLoc =  fRXPat->fFrameSize;
835                fRXPat->fFrameSize++;
836
837                int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc);
838                fRXPat->fCompiledPat->setElementAt(op, topLoc);
839
840                op = URX_BUILD(URX_JMP_SAV_X, topLoc+1);
841                fRXPat->fCompiledPat->addElement(op, *fStatus);
842            } else {
843                // Simpler code when the repeated body must match something non-empty
844                int32_t  jmpOp  = URX_BUILD(URX_JMP_SAV, topLoc);
845                fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
846            }
847        }
848        break;
849
850    case doNGPlus:
851        //  Non-greedy '+?'  compiles to
852        //     1.   stuff to be repeated  (already built)
853        //     2.   state-save  1
854        //     3.   ...
855        {
856            int32_t topLoc      = blockTopLoc(FALSE);
857            int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc);
858            fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus);
859        }
860        break;
861
862
863    case doOpt:
864        // Normal (greedy) ? quantifier.
865        //  Compiles to
866        //     1. state save 3
867        //     2.    body of optional block
868        //     3. ...
869        // Insert the state save into the compiled pattern, and we're done.
870        {
871            int32_t   saveStateLoc = blockTopLoc(TRUE);
872            int32_t   saveStateOp  = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
873            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
874        }
875        break;
876
877    case doNGOpt:
878        // Non-greedy ?? quantifier
879        //   compiles to
880        //    1.  jmp   4
881        //    2.     body of optional block
882        //    3   jmp   5
883        //    4.  state save 2
884        //    5    ...
885        //  This code is less than ideal, with two jmps instead of one, because we can only
886        //  insert one instruction at the top of the block being iterated.
887        {
888            int32_t  jmp1_loc = blockTopLoc(TRUE);
889            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
890
891            int32_t  jmp1_op  = URX_BUILD(URX_JMP, jmp2_loc+1);
892            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
893
894            int32_t  jmp2_op  = URX_BUILD(URX_JMP, jmp2_loc+2);
895            fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
896
897            int32_t  save_op  = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
898            fRXPat->fCompiledPat->addElement(save_op, *fStatus);
899        }
900        break;
901
902
903    case doStar:
904        // Normal (greedy) * quantifier.
905        // Compiles to
906        //       1.   STATE_SAVE   4
907        //       2.      body of stuff being iterated over
908        //       3.   JMP_SAV      2
909        //       4.   ...
910        //
911        // Or, if the body is a simple [Set],
912        //       1.   LOOP_SR_I    set number
913        //       2.   LOOP_C       stack location
914        //       ...
915        //
916        // Or if this is a .*
917        //       1.   LOOP_DOT_I    (. matches all mode flag)
918        //       2.   LOOP_C        stack location
919        //
920        // Or, if the body can match a zero-length string, to inhibit infinite loops,
921        //       1.   STATE_SAVE   5
922        //       2.   STO_INP_LOC  data-loc
923        //       3.      body of stuff
924        //       4.   JMP_SAV_X    2
925        //       5.   ...
926        {
927            // location of item #1, the STATE_SAVE
928            int32_t   topLoc = blockTopLoc(FALSE);
929            int32_t   dataLoc = -1;
930
931            // Check for simple *, where the construct being repeated
932            //   compiled to single opcode, and might be optimizable.
933            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
934                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
935
936                if (URX_TYPE(repeatedOp) == URX_SETREF) {
937                    // Emit optimized code for a [char set]*
938                    int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
939                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
940                    dataLoc = fRXPat->fFrameSize;
941                    fRXPat->fFrameSize++;
942                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
943                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
944                    break;
945                }
946
947                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
948                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
949                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
950                    // Emit Optimized code for .* operations.
951                    int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0);
952                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
953                        // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
954                        loopOpI |= 1;
955                    }
956                    if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
957                        loopOpI |= 2;
958                    }
959                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
960                    dataLoc = fRXPat->fFrameSize;
961                    fRXPat->fFrameSize++;
962                    int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc);
963                    fRXPat->fCompiledPat->addElement(loopOpC, *fStatus);
964                    break;
965                }
966            }
967
968            // Emit general case code for this *
969            // The optimizations did not apply.
970
971            int32_t   saveStateLoc = blockTopLoc(TRUE);
972            int32_t   jmpOp        = URX_BUILD(URX_JMP_SAV, saveStateLoc+1);
973
974            // Check for minimum match length of zero, which requires
975            //    extra loop-breaking code.
976            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
977                insertOp(saveStateLoc);
978                dataLoc =  fRXPat->fFrameSize;
979                fRXPat->fFrameSize++;
980
981                int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc);
982                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
983                jmpOp      = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
984            }
985
986            // Locate the position in the compiled pattern where the match will continue
987            //   after completing the *.   (4 or 5 in the comment above)
988            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
989
990            // Put together the save state op store it into the compiled code.
991            int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc);
992            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
993
994            // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
995            fRXPat->fCompiledPat->addElement(jmpOp, *fStatus);
996        }
997        break;
998
999    case doNGStar:
1000        // Non-greedy *? quantifier
1001        // compiles to
1002        //     1.   JMP    3
1003        //     2.      body of stuff being iterated over
1004        //     3.   STATE_SAVE  2
1005        //     4    ...
1006        {
1007            int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
1008            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
1009            int32_t     jmpOp   = URX_BUILD(URX_JMP, saveLoc);
1010            int32_t     stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
1011            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1012            fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
1013        }
1014        break;
1015
1016
1017    case doIntervalInit:
1018        // The '{' opening an interval quantifier was just scanned.
1019        // Init the counter varaiables that will accumulate the values as the digits
1020        //    are scanned.
1021        fIntervalLow = 0;
1022        fIntervalUpper = -1;
1023        break;
1024
1025    case doIntevalLowerDigit:
1026        // Scanned a digit from the lower value of an {lower,upper} interval
1027        {
1028            int32_t digitValue = u_charDigitValue(fC.fChar);
1029            U_ASSERT(digitValue >= 0);
1030            fIntervalLow = fIntervalLow*10 + digitValue;
1031            if (fIntervalLow < 0) {
1032                error(U_REGEX_NUMBER_TOO_BIG);
1033            }
1034        }
1035        break;
1036
1037    case doIntervalUpperDigit:
1038        // Scanned a digit from the upper value of an {lower,upper} interval
1039        {
1040            if (fIntervalUpper < 0) {
1041                fIntervalUpper = 0;
1042            }
1043            int32_t digitValue = u_charDigitValue(fC.fChar);
1044            U_ASSERT(digitValue >= 0);
1045            fIntervalUpper = fIntervalUpper*10 + digitValue;
1046            if (fIntervalUpper < 0) {
1047                error(U_REGEX_NUMBER_TOO_BIG);
1048            }
1049        }
1050        break;
1051
1052    case doIntervalSame:
1053        // Scanned a single value interval like {27}.  Upper = Lower.
1054        fIntervalUpper = fIntervalLow;
1055        break;
1056
1057    case doInterval:
1058        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1059        if (compileInlineInterval() == FALSE) {
1060            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1061        }
1062        break;
1063
1064    case doPossessiveInterval:
1065        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1066        {
1067            // Remember the loc for the top of the block being looped over.
1068            //   (Can not reserve a slot in the compiled pattern at this time, because
1069            //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1070            //    once per block.)
1071            int32_t topLoc = blockTopLoc(FALSE);
1072
1073            // Produce normal looping code.
1074            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1075
1076            // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1077            //  just as if the loop was inclosed in atomic parentheses.
1078
1079            // First the STO_SP before the start of the loop
1080            insertOp(topLoc);
1081            int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the
1082            fRXPat->fDataSize += 1;                    //  state stack ptr.
1083            int32_t  op        = URX_BUILD(URX_STO_SP, varLoc);
1084            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1085
1086            int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1087            U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1088            loopOp++;     // point LoopOp after the just-inserted STO_SP
1089            fRXPat->fCompiledPat->push(loopOp, *fStatus);
1090
1091            // Then the LD_SP after the end of the loop
1092            op = URX_BUILD(URX_LD_SP, varLoc);
1093            fRXPat->fCompiledPat->addElement(op, *fStatus);
1094        }
1095
1096        break;
1097
1098    case doNGInterval:
1099        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1100        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1101        break;
1102
1103    case doIntervalError:
1104        error(U_REGEX_BAD_INTERVAL);
1105        break;
1106
1107    case doLiteralChar:
1108        // We've just scanned a "normal" character from the pattern,
1109        literalChar(fC.fChar);
1110        break;
1111
1112
1113    case doEscapedLiteralChar:
1114        // We've just scanned an backslashed escaped character with  no
1115        //   special meaning.  It represents itself.
1116        if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1117            ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1118            (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1119               error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1120             }
1121        literalChar(fC.fChar);
1122        break;
1123
1124
1125    case doDotAny:
1126        // scanned a ".",  match any single character.
1127        {
1128            fixLiterals(FALSE);
1129            int32_t   op;
1130            if (fModeFlags & UREGEX_DOTALL) {
1131                op = URX_BUILD(URX_DOTANY_ALL, 0);
1132            } else if (fModeFlags & UREGEX_UNIX_LINES) {
1133                op = URX_BUILD(URX_DOTANY_UNIX, 0);
1134            } else {
1135                op = URX_BUILD(URX_DOTANY, 0);
1136            }
1137            fRXPat->fCompiledPat->addElement(op, *fStatus);
1138        }
1139        break;
1140
1141    case doCaret:
1142        {
1143            fixLiterals(FALSE);
1144            int32_t op = 0;
1145            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1146                op = URX_CARET;
1147            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1148                op = URX_CARET_M;
1149            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1150                op = URX_CARET;   // Only testing true start of input.
1151            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1152                op = URX_CARET_M_UNIX;
1153            }
1154            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1155        }
1156        break;
1157
1158    case doDollar:
1159        {
1160            fixLiterals(FALSE);
1161            int32_t op = 0;
1162            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1163                op = URX_DOLLAR;
1164            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1165                op = URX_DOLLAR_M;
1166            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1167                op = URX_DOLLAR_D;
1168            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1169                op = URX_DOLLAR_MD;
1170            }
1171            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1172        }
1173        break;
1174
1175    case doBackslashA:
1176        fixLiterals(FALSE);
1177        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
1178        break;
1179
1180    case doBackslashB:
1181        {
1182            #if  UCONFIG_NO_BREAK_ITERATION==1
1183            if (fModeFlags & UREGEX_UWORD) {
1184                error(U_UNSUPPORTED_ERROR);
1185            }
1186            #endif
1187            fixLiterals(FALSE);
1188            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1189            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
1190        }
1191        break;
1192
1193    case doBackslashb:
1194        {
1195            #if  UCONFIG_NO_BREAK_ITERATION==1
1196            if (fModeFlags & UREGEX_UWORD) {
1197                error(U_UNSUPPORTED_ERROR);
1198            }
1199            #endif
1200            fixLiterals(FALSE);
1201            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1202            fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
1203        }
1204        break;
1205
1206    case doBackslashD:
1207        fixLiterals(FALSE);
1208        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus);
1209        break;
1210
1211    case doBackslashd:
1212        fixLiterals(FALSE);
1213        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus);
1214        break;
1215
1216    case doBackslashG:
1217        fixLiterals(FALSE);
1218        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
1219        break;
1220
1221    case doBackslashS:
1222        fixLiterals(FALSE);
1223        fRXPat->fCompiledPat->addElement(
1224            URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus);
1225        break;
1226
1227    case doBackslashs:
1228        fixLiterals(FALSE);
1229        fRXPat->fCompiledPat->addElement(
1230            URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus);
1231        break;
1232
1233    case doBackslashW:
1234        fixLiterals(FALSE);
1235        fRXPat->fCompiledPat->addElement(
1236            URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus);
1237        break;
1238
1239    case doBackslashw:
1240        fixLiterals(FALSE);
1241        fRXPat->fCompiledPat->addElement(
1242            URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
1243        break;
1244
1245    case doBackslashX:
1246        fixLiterals(FALSE);
1247        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
1248        break;
1249
1250
1251    case doBackslashZ:
1252        fixLiterals(FALSE);
1253        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
1254        break;
1255
1256    case doBackslashz:
1257        fixLiterals(FALSE);
1258        fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
1259        break;
1260
1261    case doEscapeError:
1262        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1263        break;
1264
1265    case doExit:
1266        fixLiterals(FALSE);
1267        returnVal = FALSE;
1268        break;
1269
1270    case doProperty:
1271        {
1272            fixLiterals(FALSE);
1273            UnicodeSet *theSet = scanProp();
1274            compileSet(theSet);
1275        }
1276        break;
1277
1278    case doNamedChar:
1279        {
1280            UChar32 c = scanNamedChar();
1281            literalChar(c);
1282        }
1283        break;
1284
1285
1286    case doBackRef:
1287        // BackReference.  Somewhat unusual in that the front-end can not completely parse
1288        //                 the regular expression, because the number of digits to be consumed
1289        //                 depends on the number of capture groups that have been defined.  So
1290        //                 we have to do it here instead.
1291        {
1292            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1293            int32_t  groupNum = 0;
1294            UChar32  c        = fC.fChar;
1295
1296            for (;;) {
1297                // Loop once per digit, for max allowed number of digits in a back reference.
1298                int32_t digit = u_charDigitValue(c);
1299                groupNum = groupNum * 10 + digit;
1300                if (groupNum >= numCaptureGroups) {
1301                    break;
1302                }
1303                c = peekCharLL();
1304                if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1305                    break;
1306                }
1307                nextCharLL();
1308            }
1309
1310            // Scan of the back reference in the source regexp is complete.  Now generate
1311            //  the compiled code for it.
1312            // Because capture groups can be forward-referenced by back-references,
1313            //  we fill the operand with the capture group number.  At the end
1314            //  of compilation, it will be changed to the variable's location.
1315            U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1316                                     //    and shouldn't enter this code path at all.
1317            fixLiterals(FALSE);
1318            int32_t  op;
1319            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1320                op = URX_BUILD(URX_BACKREF_I, groupNum);
1321            } else {
1322                op = URX_BUILD(URX_BACKREF, groupNum);
1323            }
1324            fRXPat->fCompiledPat->addElement(op, *fStatus);
1325        }
1326        break;
1327
1328
1329    case doPossessivePlus:
1330        // Possessive ++ quantifier.
1331        // Compiles to
1332        //       1.   STO_SP
1333        //       2.      body of stuff being iterated over
1334        //       3.   STATE_SAVE 5
1335        //       4.   JMP        2
1336        //       5.   LD_SP
1337        //       6.   ...
1338        //
1339        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1340        //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1341        //
1342        {
1343            // Emit the STO_SP
1344            int32_t   topLoc = blockTopLoc(TRUE);
1345            int32_t   stoLoc = fRXPat->fDataSize;
1346            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
1347            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
1348            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1349
1350            // Emit the STATE_SAVE
1351            op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1352            fRXPat->fCompiledPat->addElement(op, *fStatus);
1353
1354            // Emit the JMP
1355            op = URX_BUILD(URX_JMP, topLoc+1);
1356            fRXPat->fCompiledPat->addElement(op, *fStatus);
1357
1358            // Emit the LD_SP
1359            op = URX_BUILD(URX_LD_SP, stoLoc);
1360            fRXPat->fCompiledPat->addElement(op, *fStatus);
1361        }
1362        break;
1363
1364    case doPossessiveStar:
1365        // Possessive *+ quantifier.
1366        // Compiles to
1367        //       1.   STO_SP       loc
1368        //       2.   STATE_SAVE   5
1369        //       3.      body of stuff being iterated over
1370        //       4.   JMP          2
1371        //       5.   LD_SP        loc
1372        //       6    ...
1373        // TODO:  do something to cut back the state stack each time through the loop.
1374        {
1375            // Reserve two slots at the top of the block.
1376            int32_t   topLoc = blockTopLoc(TRUE);
1377            insertOp(topLoc);
1378
1379            // emit   STO_SP     loc
1380            int32_t   stoLoc = fRXPat->fDataSize;
1381            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
1382            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
1383            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1384
1385            // Emit the SAVE_STATE   5
1386            int32_t L7 = fRXPat->fCompiledPat->size()+1;
1387            op = URX_BUILD(URX_STATE_SAVE, L7);
1388            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1389
1390            // Append the JMP operation.
1391            op = URX_BUILD(URX_JMP, topLoc+1);
1392            fRXPat->fCompiledPat->addElement(op, *fStatus);
1393
1394            // Emit the LD_SP       loc
1395            op = URX_BUILD(URX_LD_SP, stoLoc);
1396            fRXPat->fCompiledPat->addElement(op, *fStatus);
1397        }
1398        break;
1399
1400    case doPossessiveOpt:
1401        // Possessive  ?+ quantifier.
1402        //  Compiles to
1403        //     1. STO_SP      loc
1404        //     2. SAVE_STATE  5
1405        //     3.    body of optional block
1406        //     4. LD_SP       loc
1407        //     5. ...
1408        //
1409        {
1410            // Reserve two slots at the top of the block.
1411            int32_t   topLoc = blockTopLoc(TRUE);
1412            insertOp(topLoc);
1413
1414            // Emit the STO_SP
1415            int32_t   stoLoc = fRXPat->fDataSize;
1416            fRXPat->fDataSize++;       // Reserve the data location for storing save stack ptr.
1417            int32_t   op     = URX_BUILD(URX_STO_SP, stoLoc);
1418            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1419
1420            // Emit the SAVE_STATE
1421            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1422            op = URX_BUILD(URX_STATE_SAVE, continueLoc);
1423            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1424
1425            // Emit the LD_SP
1426            op = URX_BUILD(URX_LD_SP, stoLoc);
1427            fRXPat->fCompiledPat->addElement(op, *fStatus);
1428        }
1429        break;
1430
1431
1432    case doBeginMatchMode:
1433        fNewModeFlags = fModeFlags;
1434        fSetModeFlag  = TRUE;
1435        break;
1436
1437    case doMatchMode:   //  (?i)    and similar
1438        {
1439            int32_t  bit = 0;
1440            switch (fC.fChar) {
1441            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1442            case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1443            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1444            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1445            case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1446            case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1447            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1448            case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1449            default:
1450                U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
1451                                   // by the scanner.
1452            }
1453            if (fSetModeFlag) {
1454                fNewModeFlags |= bit;
1455            } else {
1456                fNewModeFlags &= ~bit;
1457            }
1458        }
1459        break;
1460
1461    case doSetMatchMode:
1462        // Emit code to match any pending literals, using the not-yet changed match mode.
1463        fixLiterals();
1464
1465        // We've got a (?i) or similar.  The match mode is being changed, but
1466        //   the change is not scoped to a parenthesized block.
1467        U_ASSERT(fNewModeFlags < 0);
1468        fModeFlags = fNewModeFlags;
1469
1470        break;
1471
1472
1473    case doMatchModeParen:
1474        // We've got a (?i: or similar.  Begin a parenthesized block, save old
1475        //   mode flags so they can be restored at the close of the block.
1476        //
1477        //   Compile to a
1478        //      - NOP, which later may be replaced by a save-state if the
1479        //         parenthesized group gets a * quantifier, followed by
1480        //      - NOP, which may later be replaced by a save-state if there
1481        //             is an '|' alternation within the parens.
1482        {
1483            fixLiterals(FALSE);
1484            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
1485            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
1486
1487            // On the Parentheses stack, start a new frame and add the postions
1488            //   of the two NOPs (a normal non-capturing () frame, except for the
1489            //   saving of the orignal mode flags.)
1490            fParenStack.push(fModeFlags, *fStatus);
1491            fParenStack.push(flags, *fStatus);                            // Frame Marker
1492            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1493            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1494
1495            // Set the current mode flags to the new values.
1496            U_ASSERT(fNewModeFlags < 0);
1497            fModeFlags = fNewModeFlags;
1498        }
1499        break;
1500
1501    case doBadModeFlag:
1502        error(U_REGEX_INVALID_FLAG);
1503        break;
1504
1505    case doSuppressComments:
1506        // We have just scanned a '(?'.  We now need to prevent the character scanner from
1507        // treating a '#' as a to-the-end-of-line comment.
1508        //   (This Perl compatibility just gets uglier and uglier to do...)
1509        fEOLComments = FALSE;
1510        break;
1511
1512
1513    case doSetAddAmp:
1514        {
1515          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1516          set->add(chAmp);
1517        }
1518        break;
1519
1520    case doSetAddDash:
1521        {
1522          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1523          set->add(chDash);
1524        }
1525        break;
1526
1527     case doSetBackslash_s:
1528        {
1529         UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1530         set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1531         break;
1532        }
1533
1534     case doSetBackslash_S:
1535        {
1536            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1537            UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1538            SSet.complement();
1539            set->addAll(SSet);
1540            break;
1541        }
1542
1543    case doSetBackslash_d:
1544        {
1545            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1546            // TODO - make a static set, ticket 6058.
1547            addCategory(set, U_GC_ND_MASK, *fStatus);
1548            break;
1549        }
1550
1551    case doSetBackslash_D:
1552        {
1553            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1554            UnicodeSet digits;
1555            // TODO - make a static set, ticket 6058.
1556            digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1557            digits.complement();
1558            set->addAll(digits);
1559            break;
1560        }
1561
1562    case doSetBackslash_w:
1563        {
1564            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1565            set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1566            break;
1567        }
1568
1569    case doSetBackslash_W:
1570        {
1571            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1572            UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1573            SSet.complement();
1574            set->addAll(SSet);
1575            break;
1576        }
1577
1578    case doSetBegin:
1579        fixLiterals(FALSE);
1580        fSetStack.push(new UnicodeSet(), *fStatus);
1581        fSetOpStack.push(setStart, *fStatus);
1582        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1583            fSetOpStack.push(setCaseClose, *fStatus);
1584        }
1585        break;
1586
1587    case doSetBeginDifference1:
1588        //  We have scanned something like [[abc]-[
1589        //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1590        //  Push a Difference operator, which will cause the new set to be subtracted from what
1591        //    went before once it is created.
1592        setPushOp(setDifference1);
1593        fSetOpStack.push(setStart, *fStatus);
1594        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1595            fSetOpStack.push(setCaseClose, *fStatus);
1596        }
1597        break;
1598
1599    case doSetBeginIntersection1:
1600        //  We have scanned something like  [[abc]&[
1601        //   Need both the '&' operator and the open '[' operator.
1602        setPushOp(setIntersection1);
1603        fSetOpStack.push(setStart, *fStatus);
1604        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1605            fSetOpStack.push(setCaseClose, *fStatus);
1606        }
1607        break;
1608
1609    case doSetBeginUnion:
1610        //  We have scanned something like  [[abc][
1611        //     Need to handle the union operation explicitly [[abc] | [
1612        setPushOp(setUnion);
1613        fSetOpStack.push(setStart, *fStatus);
1614        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1615            fSetOpStack.push(setCaseClose, *fStatus);
1616        }
1617        break;
1618
1619    case doSetDifference2:
1620        // We have scanned something like [abc--
1621        //   Consider this to unambiguously be a set difference operator.
1622        setPushOp(setDifference2);
1623        break;
1624
1625    case doSetEnd:
1626        // Have encountered the ']' that closes a set.
1627        //    Force the evaluation of any pending operations within this set,
1628        //    leave the completed set on the top of the set stack.
1629        setEval(setEnd);
1630        U_ASSERT(fSetOpStack.peeki()==setStart);
1631        fSetOpStack.popi();
1632        break;
1633
1634    case doSetFinish:
1635        {
1636        // Finished a complete set expression, including all nested sets.
1637        //   The close bracket has already triggered clearing out pending set operators,
1638        //    the operator stack should be empty and the operand stack should have just
1639        //    one entry, the result set.
1640        U_ASSERT(fSetOpStack.empty());
1641        UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1642        U_ASSERT(fSetStack.empty());
1643        compileSet(theSet);
1644        break;
1645        }
1646
1647    case doSetIntersection2:
1648        // Have scanned something like [abc&&
1649        setPushOp(setIntersection2);
1650        break;
1651
1652    case doSetLiteral:
1653        // Union the just-scanned literal character into the set being built.
1654        //    This operation is the highest precedence set operation, so we can always do
1655        //    it immediately, without waiting to see what follows.  It is necessary to perform
1656        //    any pending '-' or '&' operation first, because these have the same precedence
1657        //    as union-ing in a literal'
1658        {
1659            setEval(setUnion);
1660            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1661            s->add(fC.fChar);
1662            fLastSetLiteral = fC.fChar;
1663            break;
1664        }
1665
1666    case doSetLiteralEscaped:
1667        // A back-slash escaped literal character was encountered.
1668        // Processing is the same as with setLiteral, above, with the addition of
1669        //  the optional check for errors on escaped ASCII letters.
1670        {
1671            if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1672                ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1673                 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1674                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1675            }
1676            setEval(setUnion);
1677            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1678            s->add(fC.fChar);
1679            fLastSetLiteral = fC.fChar;
1680            break;
1681        }
1682
1683        case doSetNamedChar:
1684        // Scanning a \N{UNICODE CHARACTER NAME}
1685        //  Aside from the source of the character, the processing is identical to doSetLiteral,
1686        //    above.
1687        {
1688            UChar32  c = scanNamedChar();
1689            setEval(setUnion);
1690            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1691            s->add(c);
1692            fLastSetLiteral = c;
1693            break;
1694        }
1695
1696    case doSetNamedRange:
1697        // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1698        // The left character is already in the set, and is saved in fLastSetLiteral.
1699        // The right side needs to be picked up, the scan is at the 'N'.
1700        // Lower Limit > Upper limit being an error matches both Java
1701        //        and ICU UnicodeSet behavior.
1702        {
1703            UChar32  c = scanNamedChar();
1704            if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
1705                error(U_REGEX_INVALID_RANGE);
1706            }
1707            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1708            s->add(fLastSetLiteral, c);
1709            fLastSetLiteral = c;
1710            break;
1711        }
1712
1713
1714    case  doSetNegate:
1715        // Scanned a '^' at the start of a set.
1716        // Push the negation operator onto the set op stack.
1717        // A twist for case-insensitive matching:
1718        //   the case closure operation must happen _before_ negation.
1719        //   But the case closure operation will already be on the stack if it's required.
1720        //   This requires checking for case closure, and swapping the stack order
1721        //    if it is present.
1722        {
1723            int32_t  tosOp = fSetOpStack.peeki();
1724            if (tosOp == setCaseClose) {
1725                fSetOpStack.popi();
1726                fSetOpStack.push(setNegation, *fStatus);
1727                fSetOpStack.push(setCaseClose, *fStatus);
1728            } else {
1729                fSetOpStack.push(setNegation, *fStatus);
1730            }
1731        }
1732        break;
1733
1734    case doSetNoCloseError:
1735        error(U_REGEX_MISSING_CLOSE_BRACKET);
1736        break;
1737
1738    case doSetOpError:
1739        error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1740        break;
1741
1742    case doSetPosixProp:
1743        {
1744            UnicodeSet *s = scanPosixProp();
1745            if (s != NULL) {
1746                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1747                tos->addAll(*s);
1748                delete s;
1749            }  // else error.  scanProp() reported the error status already.
1750        }
1751        break;
1752
1753    case doSetProp:
1754        //  Scanned a \p \P within [brackets].
1755        {
1756            UnicodeSet *s = scanProp();
1757            if (s != NULL) {
1758                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1759                tos->addAll(*s);
1760                delete s;
1761            }  // else error.  scanProp() reported the error status already.
1762        }
1763        break;
1764
1765
1766    case doSetRange:
1767        // We have scanned literal-literal.  Add the range to the set.
1768        // The left character is already in the set, and is saved in fLastSetLiteral.
1769        // The right side is the current character.
1770        // Lower Limit > Upper limit being an error matches both Java
1771        //        and ICU UnicodeSet behavior.
1772        {
1773        if (fLastSetLiteral > fC.fChar) {
1774            error(U_REGEX_INVALID_RANGE);
1775        }
1776        UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1777        s->add(fLastSetLiteral, fC.fChar);
1778        break;
1779        }
1780
1781    default:
1782        U_ASSERT(FALSE);
1783        error(U_REGEX_INTERNAL_ERROR);
1784        break;
1785    }
1786
1787    if (U_FAILURE(*fStatus)) {
1788        returnVal = FALSE;
1789    }
1790
1791    return returnVal;
1792}
1793
1794
1795
1796//------------------------------------------------------------------------------
1797//
1798//   literalChar           We've encountered a literal character from the pattern,
1799//                             or an escape sequence that reduces to a character.
1800//                         Add it to the string containing all literal chars/strings from
1801//                             the pattern.
1802//
1803//------------------------------------------------------------------------------
1804void RegexCompile::literalChar(UChar32 c)  {
1805    fLiteralChars.append(c);
1806}
1807
1808
1809//------------------------------------------------------------------------------
1810//
1811//    fixLiterals           When compiling something that can follow a literal
1812//                          string in a pattern, emit the code to match the
1813//                          accumulated literal string.
1814//
1815//                          Optionally, split the last char of the string off into
1816//                          a single "ONE_CHAR" operation, so that quantifiers can
1817//                          apply to that char alone.  Example:   abc*
1818//                          The * must apply to the 'c' only.
1819//
1820//------------------------------------------------------------------------------
1821void    RegexCompile::fixLiterals(UBool split) {
1822    int32_t  op = 0;                       // An op from/for the compiled pattern.
1823
1824    // If no literal characters have been scanned but not yet had code generated
1825    //   for them, nothing needs to be done.
1826    if (fLiteralChars.length() == 0) {
1827        return;
1828    }
1829
1830    int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1831    UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1832
1833    // Split:  We need to  ensure that the last item in the compiled pattern
1834    //     refers only to the last literal scanned in the pattern, so that
1835    //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1836    //     Split before case folding for case insensitive matches.
1837
1838    if (split) {
1839        fLiteralChars.truncate(indexOfLastCodePoint);
1840        fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1841                              //  Note that the truncated literal string may be empty, in which case
1842                              //  nothing will be emitted.
1843
1844        literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1845        fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1846        return;
1847    }
1848
1849    // If we are doing case-insensitive matching, case fold the string.  This may expand
1850    //   the string, e.g. the German sharp-s turns into "ss"
1851    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1852        fLiteralChars.foldCase();
1853        indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1854        lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1855    }
1856
1857    if (indexOfLastCodePoint == 0) {
1858        // Single character, emit a URX_ONECHAR op to match it.
1859        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1860                 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1861            op = URX_BUILD(URX_ONECHAR_I, lastCodePoint);
1862        } else {
1863            op = URX_BUILD(URX_ONECHAR, lastCodePoint);
1864        }
1865        fRXPat->fCompiledPat->addElement(op, *fStatus);
1866    } else {
1867        // Two or more chars, emit a URX_STRING to match them.
1868        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1869            op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length());
1870        } else {
1871            // TODO here:  add optimization to split case sensitive strings of length two
1872            //             into two single char ops, for efficiency.
1873            op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length());
1874        }
1875        fRXPat->fCompiledPat->addElement(op, *fStatus);
1876        op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length());
1877        fRXPat->fCompiledPat->addElement(op, *fStatus);
1878
1879        // Add this string into the accumulated strings of the compiled pattern.
1880        fRXPat->fLiteralText.append(fLiteralChars);
1881    }
1882
1883    fLiteralChars.remove();
1884}
1885
1886
1887
1888
1889
1890
1891//------------------------------------------------------------------------------
1892//
1893//   insertOp()             Insert a slot for a new opcode into the already
1894//                          compiled pattern code.
1895//
1896//                          Fill the slot with a NOP.  Our caller will replace it
1897//                          with what they really wanted.
1898//
1899//------------------------------------------------------------------------------
1900void   RegexCompile::insertOp(int32_t where) {
1901    UVector64 *code = fRXPat->fCompiledPat;
1902    U_ASSERT(where>0 && where < code->size());
1903
1904    int32_t  nop = URX_BUILD(URX_NOP, 0);
1905    code->insertElementAt(nop, where, *fStatus);
1906
1907    // Walk through the pattern, looking for any ops with targets that
1908    //  were moved down by the insert.  Fix them.
1909    int32_t loc;
1910    for (loc=0; loc<code->size(); loc++) {
1911        int32_t op = (int32_t)code->elementAti(loc);
1912        int32_t opType = URX_TYPE(op);
1913        int32_t opValue = URX_VAL(op);
1914        if ((opType == URX_JMP         ||
1915            opType == URX_JMPX         ||
1916            opType == URX_STATE_SAVE   ||
1917            opType == URX_CTR_LOOP     ||
1918            opType == URX_CTR_LOOP_NG  ||
1919            opType == URX_JMP_SAV      ||
1920            opType == URX_JMP_SAV_X    ||
1921            opType == URX_RELOC_OPRND)    && opValue > where) {
1922            // Target location for this opcode is after the insertion point and
1923            //   needs to be incremented to adjust for the insertion.
1924            opValue++;
1925            op = URX_BUILD(opType, opValue);
1926            code->setElementAt(op, loc);
1927        }
1928    }
1929
1930    // Now fix up the parentheses stack.  All positive values in it are locations in
1931    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
1932    for (loc=0; loc<fParenStack.size(); loc++) {
1933        int32_t x = fParenStack.elementAti(loc);
1934        U_ASSERT(x < code->size());
1935        if (x>where) {
1936            x++;
1937            fParenStack.setElementAt(x, loc);
1938        }
1939    }
1940
1941    if (fMatchCloseParen > where) {
1942        fMatchCloseParen++;
1943    }
1944    if (fMatchOpenParen > where) {
1945        fMatchOpenParen++;
1946    }
1947}
1948
1949
1950
1951//------------------------------------------------------------------------------
1952//
1953//   blockTopLoc()          Find or create a location in the compiled pattern
1954//                          at the start of the operation or block that has
1955//                          just been compiled.  Needed when a quantifier (* or
1956//                          whatever) appears, and we need to add an operation
1957//                          at the start of the thing being quantified.
1958//
1959//                          (Parenthesized Blocks) have a slot with a NOP that
1960//                          is reserved for this purpose.  .* or similar don't
1961//                          and a slot needs to be added.
1962//
1963//       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
1964//                                         at the returned location.
1965//                                 FALSE - just return the address,
1966//                                         do not reserve a location there.
1967//
1968//------------------------------------------------------------------------------
1969int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
1970    int32_t   theLoc;
1971    fixLiterals(TRUE);  // Emit code for any pending literals.
1972                        //   If last item was a string, emit separate op for the its last char.
1973    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
1974    {
1975        // The item just processed is a parenthesized block.
1976        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
1977        U_ASSERT(theLoc > 0);
1978        U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
1979    }
1980    else {
1981        // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
1982        // No slot for STATE_SAVE was pre-reserved in the compiled code.
1983        // We need to make space now.
1984        theLoc = fRXPat->fCompiledPat->size()-1;
1985        int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
1986        if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
1987            // Strings take two opcode, we want the position of the first one.
1988            // We can have a string at this point if a single character case-folded to two.
1989            theLoc--;
1990        }
1991        if (reserveLoc) {
1992            int32_t  nop = URX_BUILD(URX_NOP, 0);
1993            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
1994        }
1995    }
1996    return theLoc;
1997}
1998
1999
2000
2001//------------------------------------------------------------------------------
2002//
2003//    handleCloseParen      When compiling a close paren, we need to go back
2004//                          and fix up any JMP or SAVE operations within the
2005//                          parenthesized block that need to target the end
2006//                          of the block.  The locations of these are kept on
2007//                          the paretheses stack.
2008//
2009//                          This function is called both when encountering a
2010//                          real ) and at the end of the pattern.
2011//
2012//------------------------------------------------------------------------------
2013void  RegexCompile::handleCloseParen() {
2014    int32_t   patIdx;
2015    int32_t   patOp;
2016    if (fParenStack.size() <= 0) {
2017        error(U_REGEX_MISMATCHED_PAREN);
2018        return;
2019    }
2020
2021    // Emit code for any pending literals.
2022    fixLiterals(FALSE);
2023
2024    // Fixup any operations within the just-closed parenthesized group
2025    //    that need to reference the end of the (block).
2026    //    (The first one popped from the stack is an unused slot for
2027    //     alternation (OR) state save, but applying the fixup to it does no harm.)
2028    for (;;) {
2029        patIdx = fParenStack.popi();
2030        if (patIdx < 0) {
2031            // value < 0 flags the start of the frame on the paren stack.
2032            break;
2033        }
2034        U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2035        patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2036        U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2037        patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2038        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2039        fMatchOpenParen     = patIdx;
2040    }
2041
2042    //  At the close of any parenthesized block, restore the match mode flags  to
2043    //  the value they had at the open paren.  Saved value is
2044    //  at the top of the paren stack.
2045    fModeFlags = fParenStack.popi();
2046    U_ASSERT(fModeFlags < 0);
2047
2048    // DO any additional fixups, depending on the specific kind of
2049    // parentesized grouping this is
2050
2051    switch (patIdx) {
2052    case plain:
2053    case flags:
2054        // No additional fixups required.
2055        //   (Grouping-only parentheses)
2056        break;
2057    case capturing:
2058        // Capturing Parentheses.
2059        //   Insert a End Capture op into the pattern.
2060        //   The frame offset of the variables for this cg is obtained from the
2061        //       start capture op and put it into the end-capture op.
2062        {
2063            int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2064            U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2065
2066            int32_t   frameVarLocation = URX_VAL(captureOp);
2067            int32_t   endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation);
2068            fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus);
2069        }
2070        break;
2071    case atomic:
2072        // Atomic Parenthesis.
2073        //   Insert a LD_SP operation to restore the state stack to the position
2074        //   it was when the atomic parens were entered.
2075        {
2076            int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2077            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2078            int32_t   stoLoc = URX_VAL(stoOp);
2079            int32_t   ldOp   = URX_BUILD(URX_LD_SP, stoLoc);
2080            fRXPat->fCompiledPat->addElement(ldOp, *fStatus);
2081        }
2082        break;
2083
2084    case lookAhead:
2085        {
2086            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2087            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2088            int32_t dataLoc  = URX_VAL(startOp);
2089            int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
2090            fRXPat->fCompiledPat->addElement(op, *fStatus);
2091        }
2092        break;
2093
2094    case negLookAhead:
2095        {
2096            // See comment at doOpenLookAheadNeg
2097            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2098            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2099            int32_t dataLoc  = URX_VAL(startOp);
2100            int32_t op       = URX_BUILD(URX_LA_END, dataLoc);
2101            fRXPat->fCompiledPat->addElement(op, *fStatus);
2102            op               = URX_BUILD(URX_BACKTRACK, 0);
2103            fRXPat->fCompiledPat->addElement(op, *fStatus);
2104            op               = URX_BUILD(URX_LA_END, dataLoc);
2105            fRXPat->fCompiledPat->addElement(op, *fStatus);
2106
2107            // Patch the URX_SAVE near the top of the block.
2108            // The destination of the SAVE is the final LA_END that was just added.
2109            int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2110            U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2111            int32_t dest     = fRXPat->fCompiledPat->size()-1;
2112            saveOp           = URX_BUILD(URX_STATE_SAVE, dest);
2113            fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2114        }
2115        break;
2116
2117    case lookBehind:
2118        {
2119            // See comment at doOpenLookBehind.
2120
2121            // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2122            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2123            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2124            int32_t dataLoc  = URX_VAL(startOp);
2125            int32_t op       = URX_BUILD(URX_LB_END, dataLoc);
2126            fRXPat->fCompiledPat->addElement(op, *fStatus);
2127                    op       = URX_BUILD(URX_LA_END, dataLoc);
2128            fRXPat->fCompiledPat->addElement(op, *fStatus);
2129
2130            // Determine the min and max bounds for the length of the
2131            //  string that the pattern can match.
2132            //  An unbounded upper limit is an error.
2133            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2134            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2135            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2136            if (maxML == INT32_MAX) {
2137                error(U_REGEX_LOOK_BEHIND_LIMIT);
2138                break;
2139            }
2140            U_ASSERT(minML <= maxML);
2141
2142            // Insert the min and max match len bounds into the URX_LB_CONT op that
2143            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2144            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2145            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2146
2147        }
2148        break;
2149
2150
2151
2152    case lookBehindN:
2153        {
2154            // See comment at doOpenLookBehindNeg.
2155
2156            // Append the URX_LBN_END to the compiled pattern.
2157            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2158            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2159            int32_t dataLoc  = URX_VAL(startOp);
2160            int32_t op       = URX_BUILD(URX_LBN_END, dataLoc);
2161            fRXPat->fCompiledPat->addElement(op, *fStatus);
2162
2163            // Determine the min and max bounds for the length of the
2164            //  string that the pattern can match.
2165            //  An unbounded upper limit is an error.
2166            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2167            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2168            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2169            if (maxML == INT32_MAX) {
2170                error(U_REGEX_LOOK_BEHIND_LIMIT);
2171                break;
2172            }
2173            U_ASSERT(minML <= maxML);
2174
2175            // Insert the min and max match len bounds into the URX_LB_CONT op that
2176            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2177            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2178            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2179
2180            // Insert the pattern location to continue at after a successful match
2181            //  as the last operand of the URX_LBN_CONT
2182            op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2183            fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2184        }
2185        break;
2186
2187
2188
2189    default:
2190        U_ASSERT(FALSE);
2191    }
2192
2193    // remember the next location in the compiled pattern.
2194    // The compilation of Quantifiers will look at this to see whether its looping
2195    //   over a parenthesized block or a single item
2196    fMatchCloseParen = fRXPat->fCompiledPat->size();
2197}
2198
2199
2200
2201//------------------------------------------------------------------------------
2202//
2203//   compileSet       Compile the pattern operations for a reference to a
2204//                    UnicodeSet.
2205//
2206//------------------------------------------------------------------------------
2207void        RegexCompile::compileSet(UnicodeSet *theSet)
2208{
2209    if (theSet == NULL) {
2210        return;
2211    }
2212    //  Remove any strings from the set.
2213    //  There shoudn't be any, but just in case.
2214    //     (Case Closure can add them; if we had a simple case closure avaialble that
2215    //      ignored strings, that would be better.)
2216    theSet->removeAllStrings();
2217    int32_t  setSize = theSet->size();
2218
2219    switch (setSize) {
2220    case 0:
2221        {
2222            // Set of no elements.   Always fails to match.
2223            fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
2224            delete theSet;
2225        }
2226        break;
2227
2228    case 1:
2229        {
2230            // The set contains only a single code point.  Put it into
2231            //   the compiled pattern as a single char operation rather
2232            //   than a set, and discard the set itself.
2233            literalChar(theSet->charAt(0));
2234            delete theSet;
2235        }
2236        break;
2237
2238    default:
2239        {
2240            //  The set contains two or more chars.  (the normal case)
2241            //  Put it into the compiled pattern as a set.
2242            int32_t setNumber = fRXPat->fSets->size();
2243            fRXPat->fSets->addElement(theSet, *fStatus);
2244            int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
2245            fRXPat->fCompiledPat->addElement(setOp, *fStatus);
2246        }
2247    }
2248}
2249
2250
2251//------------------------------------------------------------------------------
2252//
2253//   compileInterval    Generate the code for a {min, max} style interval quantifier.
2254//                      Except for the specific opcodes used, the code is the same
2255//                      for all three types (greedy, non-greedy, possessive) of
2256//                      intervals.  The opcodes are supplied as parameters.
2257//                      (There are two sets of opcodes - greedy & possessive use the
2258//                      same ones, while non-greedy has it's own.)
2259//
2260//                      The code for interval loops has this form:
2261//                         0  CTR_INIT   counter loc (in stack frame)
2262//                         1             5  patt address of CTR_LOOP at bottom of block
2263//                         2             min count
2264//                         3             max count   (-1 for unbounded)
2265//                         4  ...        block to be iterated over
2266//                         5  CTR_LOOP
2267//
2268//                       In
2269//------------------------------------------------------------------------------
2270void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2271{
2272    // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2273    //   four slots in the compiled code.  Reserve them.
2274    int32_t   topOfBlock = blockTopLoc(TRUE);
2275    insertOp(topOfBlock);
2276    insertOp(topOfBlock);
2277    insertOp(topOfBlock);
2278
2279    // The operands for the CTR_INIT opcode include the index in the matcher data
2280    //   of the counter.  Allocate it now. There are two data items
2281    //        counterLoc   -->  Loop counter
2282    //               +1    -->  Input index (for breaking non-progressing loops)
2283    //                          (Only present if unbounded upper limit on loop)
2284    int32_t   counterLoc = fRXPat->fFrameSize;
2285    fRXPat->fFrameSize++;
2286    if (fIntervalUpper < 0) {
2287        fRXPat->fFrameSize++;
2288    }
2289
2290    int32_t   op = URX_BUILD(InitOp, counterLoc);
2291    fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2292
2293    // The second operand of CTR_INIT is the location following the end of the loop.
2294    //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2295    //   compilation of something later on causes the code to grow and the target
2296    //   position to move.
2297    int32_t loopEnd = fRXPat->fCompiledPat->size();
2298    op = URX_BUILD(URX_RELOC_OPRND, loopEnd);
2299    fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2300
2301    // Followed by the min and max counts.
2302    fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2303    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2304
2305    // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2306    //   Goes at end of the block being looped over, so just append to the code so far.
2307    op = URX_BUILD(LoopOp, topOfBlock);
2308    fRXPat->fCompiledPat->addElement(op, *fStatus);
2309
2310    if ((fIntervalLow & 0xff000000) != 0 ||
2311        (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2312            error(U_REGEX_NUMBER_TOO_BIG);
2313        }
2314
2315    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2316        error(U_REGEX_MAX_LT_MIN);
2317    }
2318}
2319
2320
2321
2322UBool RegexCompile::compileInlineInterval() {
2323    if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2324        // Too big to inline.  Fail, which will cause looping code to be generated.
2325        //   (Upper < Lower picks up unbounded upper and errors, both.)
2326        return FALSE;
2327    }
2328
2329    int32_t   topOfBlock = blockTopLoc(FALSE);
2330    if (fIntervalUpper == 0) {
2331        // Pathological case.  Attempt no matches, as if the block doesn't exist.
2332        fRXPat->fCompiledPat->setSize(topOfBlock);
2333        return TRUE;
2334    }
2335
2336    if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2337        // The thing being repeated is not a single op, but some
2338        //   more complex block.  Do it as a loop, not inlines.
2339        //   Note that things "repeated" a max of once are handled as inline, because
2340        //     the one copy of the code already generated is just fine.
2341        return FALSE;
2342    }
2343
2344    // Pick up the opcode that is to be repeated
2345    //
2346    int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2347
2348    // Compute the pattern location where the inline sequence
2349    //   will end, and set up the state save op that will be needed.
2350    //
2351    int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2352                                + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2353    int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc);
2354    if (fIntervalLow == 0) {
2355        insertOp(topOfBlock);
2356        fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2357    }
2358
2359
2360
2361    //  Loop, emitting the op for the thing being repeated each time.
2362    //    Loop starts at 1 because one instance of the op already exists in the pattern,
2363    //    it was put there when it was originally encountered.
2364    int32_t i;
2365    for (i=1; i<fIntervalUpper; i++ ) {
2366        if (i == fIntervalLow) {
2367            fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
2368        }
2369        if (i > fIntervalLow) {
2370            fRXPat->fCompiledPat->addElement(saveOp, *fStatus);
2371        }
2372        fRXPat->fCompiledPat->addElement(op, *fStatus);
2373    }
2374    return TRUE;
2375}
2376
2377
2378
2379//------------------------------------------------------------------------------
2380//
2381//   matchStartType    Determine how a match can start.
2382//                     Used to optimize find() operations.
2383//
2384//                     Operation is very similar to minMatchLength().  Walk the compiled
2385//                     pattern, keeping an on-going minimum-match-length.  For any
2386//                     op where the min match coming in is zero, add that ops possible
2387//                     starting matches to the possible starts for the overall pattern.
2388//
2389//------------------------------------------------------------------------------
2390void   RegexCompile::matchStartType() {
2391    if (U_FAILURE(*fStatus)) {
2392        return;
2393    }
2394
2395
2396    int32_t    loc;                    // Location in the pattern of the current op being processed.
2397    int32_t    op;                     // The op being processed
2398    int32_t    opType;                 // The opcode type of the op
2399    int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2400    int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2401
2402    UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2403                                       //   could have advanced the position in a match.
2404                                       //   (Maximum match length so far == 0)
2405
2406    // forwardedLength is a vector holding minimum-match-length values that
2407    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2408    //   It must be one longer than the pattern being checked because some  ops
2409    //   will jmp to a end-of-block+1 location from within a block, and we must
2410    //   count those when checking the block.
2411    int32_t end = fRXPat->fCompiledPat->size();
2412    UVector32  forwardedLength(end+1, *fStatus);
2413    forwardedLength.setSize(end+1);
2414    for (loc=3; loc<end; loc++) {
2415        forwardedLength.setElementAt(INT32_MAX, loc);
2416    }
2417
2418    for (loc = 3; loc<end; loc++) {
2419        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2420        opType = URX_TYPE(op);
2421
2422        // The loop is advancing linearly through the pattern.
2423        // If the op we are now at was the destination of a branch in the pattern,
2424        // and that path has a shorter minimum length than the current accumulated value,
2425        // replace the current accumulated value.
2426        if (forwardedLength.elementAti(loc) < currentLen) {
2427            currentLen = forwardedLength.elementAti(loc);
2428            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2429        }
2430
2431        switch (opType) {
2432            // Ops that don't change the total length matched
2433        case URX_RESERVED_OP:
2434        case URX_END:
2435        case URX_FAIL:
2436        case URX_STRING_LEN:
2437        case URX_NOP:
2438        case URX_START_CAPTURE:
2439        case URX_END_CAPTURE:
2440        case URX_BACKSLASH_B:
2441        case URX_BACKSLASH_BU:
2442        case URX_BACKSLASH_G:
2443        case URX_BACKSLASH_Z:
2444        case URX_DOLLAR:
2445        case URX_DOLLAR_M:
2446        case URX_DOLLAR_D:
2447        case URX_DOLLAR_MD:
2448        case URX_RELOC_OPRND:
2449        case URX_STO_INP_LOC:
2450        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2451        case URX_BACKREF_I:
2452
2453        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2454        case URX_LD_SP:
2455            break;
2456
2457        case URX_CARET:
2458            if (atStart) {
2459                fRXPat->fStartType = START_START;
2460            }
2461            break;
2462
2463        case URX_CARET_M:
2464        case URX_CARET_M_UNIX:
2465            if (atStart) {
2466                fRXPat->fStartType = START_LINE;
2467            }
2468            break;
2469
2470        case URX_ONECHAR:
2471            if (currentLen == 0) {
2472                // This character could appear at the start of a match.
2473                //   Add it to the set of possible starting characters.
2474                fRXPat->fInitialChars->add(URX_VAL(op));
2475                numInitialStrings += 2;
2476            }
2477            currentLen++;
2478            atStart = FALSE;
2479            break;
2480
2481
2482        case URX_SETREF:
2483            if (currentLen == 0) {
2484                int32_t  sn = URX_VAL(op);
2485                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2486                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2487                fRXPat->fInitialChars->addAll(*s);
2488                numInitialStrings += 2;
2489            }
2490            currentLen++;
2491            atStart = FALSE;
2492            break;
2493
2494        case URX_LOOP_SR_I:
2495            // [Set]*, like a SETREF, above, in what it can match,
2496            //  but may not match at all, so currentLen is not incremented.
2497            if (currentLen == 0) {
2498                int32_t  sn = URX_VAL(op);
2499                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2500                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2501                fRXPat->fInitialChars->addAll(*s);
2502                numInitialStrings += 2;
2503            }
2504            atStart = FALSE;
2505            break;
2506
2507        case URX_LOOP_DOT_I:
2508            if (currentLen == 0) {
2509                // .* at the start of a pattern.
2510                //    Any character can begin the match.
2511                fRXPat->fInitialChars->clear();
2512                fRXPat->fInitialChars->complement();
2513                numInitialStrings += 2;
2514            }
2515            atStart = FALSE;
2516            break;
2517
2518
2519        case URX_STATIC_SETREF:
2520            if (currentLen == 0) {
2521                int32_t  sn = URX_VAL(op);
2522                U_ASSERT(sn>0 && sn<URX_LAST_SET);
2523                const UnicodeSet *s = fRXPat->fStaticSets[sn];
2524                fRXPat->fInitialChars->addAll(*s);
2525                numInitialStrings += 2;
2526            }
2527            currentLen++;
2528            atStart = FALSE;
2529            break;
2530
2531
2532
2533        case URX_STAT_SETREF_N:
2534            if (currentLen == 0) {
2535                int32_t  sn = URX_VAL(op);
2536                const UnicodeSet *s = fRXPat->fStaticSets[sn];
2537                UnicodeSet sc(*s);
2538                sc.complement();
2539                fRXPat->fInitialChars->addAll(sc);
2540                numInitialStrings += 2;
2541            }
2542            currentLen++;
2543            atStart = FALSE;
2544            break;
2545
2546
2547
2548        case URX_BACKSLASH_D:
2549            // Digit Char
2550             if (currentLen == 0) {
2551                 UnicodeSet s;
2552                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2553                 if (URX_VAL(op) != 0) {
2554                     s.complement();
2555                 }
2556                 fRXPat->fInitialChars->addAll(s);
2557                 numInitialStrings += 2;
2558            }
2559            currentLen++;
2560            atStart = FALSE;
2561            break;
2562
2563
2564        case URX_ONECHAR_I:
2565            // Case Insensitive Single Character.
2566            if (currentLen == 0) {
2567                UChar32  c = URX_VAL(op);
2568                if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2569
2570                    // Disable optimizations on first char of match.
2571                    // TODO: Compute the set of chars that case fold to this char, or to
2572                    //       a string that begins with this char.
2573                    //       For simple case folding, this code worked:
2574                    //   UnicodeSet s(c, c);
2575                    //   s.closeOver(USET_CASE_INSENSITIVE);
2576                    //   fRXPat->fInitialChars->addAll(s);
2577
2578                    fRXPat->fInitialChars->clear();
2579                    fRXPat->fInitialChars->complement();
2580                } else {
2581                    // Char has no case variants.  Just add it as-is to the
2582                    //   set of possible starting chars.
2583                    fRXPat->fInitialChars->add(c);
2584                }
2585                numInitialStrings += 2;
2586            }
2587            currentLen++;
2588            atStart = FALSE;
2589            break;
2590
2591
2592        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2593        case URX_DOTANY_ALL:    // . matches one or two.
2594        case URX_DOTANY:
2595        case URX_DOTANY_UNIX:
2596            if (currentLen == 0) {
2597                // These constructs are all bad news when they appear at the start
2598                //   of a match.  Any character can begin the match.
2599                fRXPat->fInitialChars->clear();
2600                fRXPat->fInitialChars->complement();
2601                numInitialStrings += 2;
2602            }
2603            currentLen++;
2604            atStart = FALSE;
2605            break;
2606
2607
2608        case URX_JMPX:
2609            loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2610        case URX_JMP:
2611            {
2612                int32_t  jmpDest = URX_VAL(op);
2613                if (jmpDest < loc) {
2614                    // Loop of some kind.  Can safely ignore, the worst that will happen
2615                    //  is that we understate the true minimum length
2616                    currentLen = forwardedLength.elementAti(loc+1);
2617
2618                } else {
2619                    // Forward jump.  Propagate the current min length to the target loc of the jump.
2620                    U_ASSERT(jmpDest <= end+1);
2621                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
2622                        forwardedLength.setElementAt(currentLen, jmpDest);
2623                    }
2624                }
2625            }
2626            atStart = FALSE;
2627            break;
2628
2629        case URX_JMP_SAV:
2630        case URX_JMP_SAV_X:
2631            // Combo of state save to the next loc, + jmp backwards.
2632            //   Net effect on min. length computation is nothing.
2633            atStart = FALSE;
2634            break;
2635
2636        case URX_BACKTRACK:
2637            // Fails are kind of like a branch, except that the min length was
2638            //   propagated already, by the state save.
2639            currentLen = forwardedLength.elementAti(loc+1);
2640            atStart = FALSE;
2641            break;
2642
2643
2644        case URX_STATE_SAVE:
2645            {
2646                // State Save, for forward jumps, propagate the current minimum.
2647                //             of the state save.
2648                int32_t  jmpDest = URX_VAL(op);
2649                if (jmpDest > loc) {
2650                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
2651                        forwardedLength.setElementAt(currentLen, jmpDest);
2652                    }
2653                }
2654            }
2655            atStart = FALSE;
2656            break;
2657
2658
2659
2660
2661        case URX_STRING:
2662            {
2663                loc++;
2664                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2665                int32_t stringLen   = URX_VAL(stringLenOp);
2666                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2667                U_ASSERT(stringLenOp >= 2);
2668                if (currentLen == 0) {
2669                    // Add the starting character of this string to the set of possible starting
2670                    //   characters for this pattern.
2671                    int32_t stringStartIdx = URX_VAL(op);
2672                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2673                    fRXPat->fInitialChars->add(c);
2674
2675                    // Remember this string.  After the entire pattern has been checked,
2676                    //  if nothing else is identified that can start a match, we'll use it.
2677                    numInitialStrings++;
2678                    fRXPat->fInitialStringIdx = stringStartIdx;
2679                    fRXPat->fInitialStringLen = stringLen;
2680                }
2681
2682                currentLen += stringLen;
2683                atStart = FALSE;
2684            }
2685            break;
2686
2687        case URX_STRING_I:
2688            {
2689                // Case-insensitive string.  Unlike exact-match strings, we won't
2690                //   attempt a string search for possible match positions.  But we
2691                //   do update the set of possible starting characters.
2692                loc++;
2693                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2694                int32_t stringLen   = URX_VAL(stringLenOp);
2695                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2696                U_ASSERT(stringLenOp >= 2);
2697                if (currentLen == 0) {
2698                    // Add the starting character of this string to the set of possible starting
2699                    //   characters for this pattern.
2700                    int32_t stringStartIdx = URX_VAL(op);
2701                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2702                    UnicodeSet s(c, c);
2703
2704                    // TODO:  compute correct set of starting chars for full case folding.
2705                    //        For the moment, say any char can start.
2706                    // s.closeOver(USET_CASE_INSENSITIVE);
2707                    s.clear();
2708                    s.complement();
2709
2710                    fRXPat->fInitialChars->addAll(s);
2711                    numInitialStrings += 2;  // Matching on an initial string not possible.
2712                }
2713                currentLen += stringLen;
2714                atStart = FALSE;
2715            }
2716            break;
2717
2718        case URX_CTR_INIT:
2719        case URX_CTR_INIT_NG:
2720            {
2721                // Loop Init Ops.  These don't change the min length, but they are 4 word ops
2722                //   so location must be updated accordingly.
2723                // Loop Init Ops.
2724                //   If the min loop count == 0
2725                //      move loc forwards to the end of the loop, skipping over the body.
2726                //   If the min count is > 0,
2727                //      continue normal processing of the body of the loop.
2728                int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
2729                        loopEndLoc   = URX_VAL(loopEndLoc);
2730                int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
2731                if (minLoopCount == 0) {
2732                    // Min Loop Count of 0, treat like a forward branch and
2733                    //   move the current minimum length up to the target
2734                    //   (end of loop) location.
2735                    U_ASSERT(loopEndLoc <= end+1);
2736                    if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
2737                        forwardedLength.setElementAt(currentLen, loopEndLoc);
2738                    }
2739                }
2740                loc+=3;  // Skips over operands of CTR_INIT
2741            }
2742            atStart = FALSE;
2743            break;
2744
2745
2746        case URX_CTR_LOOP:
2747        case URX_CTR_LOOP_NG:
2748            // Loop ops.
2749            //  The jump is conditional, backwards only.
2750            atStart = FALSE;
2751            break;
2752
2753        case URX_LOOP_C:
2754            // More loop ops.  These state-save to themselves.
2755            //   don't change the minimum match
2756            atStart = FALSE;
2757            break;
2758
2759
2760        case URX_LA_START:
2761        case URX_LB_START:
2762            {
2763                // Look-around.  Scan forward until the matching look-ahead end,
2764                //   without processing the look-around block.  This is overly pessimistic.
2765
2766                // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
2767                //   lookahead contains two LA_END instructions, so count goes up by two
2768                //   for each LA_START.
2769                int32_t  depth = (opType == URX_LA_START? 2: 1);
2770                for (;;) {
2771                    loc++;
2772                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2773                    if (URX_TYPE(op) == URX_LA_START) {
2774                        depth+=2;
2775                    }
2776                    if (URX_TYPE(op) == URX_LB_START) {
2777                        depth++;
2778                    }
2779                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
2780                        depth--;
2781                        if (depth == 0) {
2782                            break;
2783                        }
2784                    }
2785                    if (URX_TYPE(op) == URX_STATE_SAVE) {
2786                        // Need this because neg lookahead blocks will FAIL to outside
2787                        //   of the block.
2788                        int32_t  jmpDest = URX_VAL(op);
2789                        if (jmpDest > loc) {
2790                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
2791                                forwardedLength.setElementAt(currentLen, jmpDest);
2792                            }
2793                        }
2794                    }
2795                    U_ASSERT(loc <= end);
2796                }
2797            }
2798            break;
2799
2800        case URX_LA_END:
2801        case URX_LB_CONT:
2802        case URX_LB_END:
2803        case URX_LBN_CONT:
2804        case URX_LBN_END:
2805            U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
2806                                 //  consumed by the scan in URX_LA_START and LB_START
2807
2808            break;
2809
2810        default:
2811            U_ASSERT(FALSE);
2812            }
2813
2814        }
2815
2816
2817    // We have finished walking through the ops.  Check whether some forward jump
2818    //   propagated a shorter length to location end+1.
2819    if (forwardedLength.elementAti(end+1) < currentLen) {
2820        currentLen = forwardedLength.elementAti(end+1);
2821    }
2822
2823
2824    fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
2825
2826
2827    // Sort out what we should check for when looking for candidate match start positions.
2828    // In order of preference,
2829    //     1.   Start of input text buffer.
2830    //     2.   A literal string.
2831    //     3.   Start of line in multi-line mode.
2832    //     4.   A single literal character.
2833    //     5.   A character from a set of characters.
2834    //
2835    if (fRXPat->fStartType == START_START) {
2836        // Match only at the start of an input text string.
2837        //    start type is already set.  We're done.
2838    } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
2839        // Match beginning only with a literal string.
2840        UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
2841        U_ASSERT(fRXPat->fInitialChars->contains(c));
2842        fRXPat->fStartType   = START_STRING;
2843        fRXPat->fInitialChar = c;
2844    } else if (fRXPat->fStartType == START_LINE) {
2845        // Match at start of line in Multi-Line mode.
2846        // Nothing to do here; everything is already set.
2847    } else if (fRXPat->fMinMatchLen == 0) {
2848        // Zero length match possible.  We could start anywhere.
2849        fRXPat->fStartType = START_NO_INFO;
2850    } else if (fRXPat->fInitialChars->size() == 1) {
2851        // All matches begin with the same char.
2852        fRXPat->fStartType   = START_CHAR;
2853        fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
2854        U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
2855    } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
2856        fRXPat->fMinMatchLen > 0) {
2857        // Matches start with a set of character smaller than the set of all chars.
2858        fRXPat->fStartType = START_SET;
2859    } else {
2860        // Matches can start with anything
2861        fRXPat->fStartType = START_NO_INFO;
2862    }
2863
2864    return;
2865}
2866
2867
2868
2869//------------------------------------------------------------------------------
2870//
2871//   minMatchLength    Calculate the length of the shortest string that could
2872//                     match the specified pattern.
2873//                     Length is in 16 bit code units, not code points.
2874//
2875//                     The calculated length may not be exact.  The returned
2876//                     value may be shorter than the actual minimum; it must
2877//                     never be longer.
2878//
2879//                     start and end are the range of p-code operations to be
2880//                     examined.  The endpoints are included in the range.
2881//
2882//------------------------------------------------------------------------------
2883int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
2884    if (U_FAILURE(*fStatus)) {
2885        return 0;
2886    }
2887
2888    U_ASSERT(start <= end);
2889    U_ASSERT(end < fRXPat->fCompiledPat->size());
2890
2891
2892    int32_t    loc;
2893    int32_t    op;
2894    int32_t    opType;
2895    int32_t    currentLen = 0;
2896
2897
2898    // forwardedLength is a vector holding minimum-match-length values that
2899    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2900    //   It must be one longer than the pattern being checked because some  ops
2901    //   will jmp to a end-of-block+1 location from within a block, and we must
2902    //   count those when checking the block.
2903    UVector32  forwardedLength(end+2, *fStatus);
2904    forwardedLength.setSize(end+2);
2905    for (loc=start; loc<=end+1; loc++) {
2906        forwardedLength.setElementAt(INT32_MAX, loc);
2907    }
2908
2909    for (loc = start; loc<=end; loc++) {
2910        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2911        opType = URX_TYPE(op);
2912
2913        // The loop is advancing linearly through the pattern.
2914        // If the op we are now at was the destination of a branch in the pattern,
2915        // and that path has a shorter minimum length than the current accumulated value,
2916        // replace the current accumulated value.
2917        // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
2918                                                               //   no-match-possible cases.
2919        if (forwardedLength.elementAti(loc) < currentLen) {
2920            currentLen = forwardedLength.elementAti(loc);
2921            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2922        }
2923
2924        switch (opType) {
2925            // Ops that don't change the total length matched
2926        case URX_RESERVED_OP:
2927        case URX_END:
2928        case URX_STRING_LEN:
2929        case URX_NOP:
2930        case URX_START_CAPTURE:
2931        case URX_END_CAPTURE:
2932        case URX_BACKSLASH_B:
2933        case URX_BACKSLASH_BU:
2934        case URX_BACKSLASH_G:
2935        case URX_BACKSLASH_Z:
2936        case URX_CARET:
2937        case URX_DOLLAR:
2938        case URX_DOLLAR_M:
2939        case URX_DOLLAR_D:
2940        case URX_DOLLAR_MD:
2941        case URX_RELOC_OPRND:
2942        case URX_STO_INP_LOC:
2943        case URX_CARET_M:
2944        case URX_CARET_M_UNIX:
2945        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2946        case URX_BACKREF_I:
2947
2948        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2949        case URX_LD_SP:
2950
2951        case URX_JMP_SAV:
2952        case URX_JMP_SAV_X:
2953            break;
2954
2955
2956            // Ops that match a minimum of one character (one or two 16 bit code units.)
2957            //
2958        case URX_ONECHAR:
2959        case URX_STATIC_SETREF:
2960        case URX_STAT_SETREF_N:
2961        case URX_SETREF:
2962        case URX_BACKSLASH_D:
2963        case URX_ONECHAR_I:
2964        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2965        case URX_DOTANY_ALL:    // . matches one or two.
2966        case URX_DOTANY:
2967        case URX_DOTANY_UNIX:
2968            currentLen++;
2969            break;
2970
2971
2972        case URX_JMPX:
2973            loc++;              // URX_JMPX has an extra operand, ignored here,
2974                                //   otherwise processed identically to URX_JMP.
2975        case URX_JMP:
2976            {
2977                int32_t  jmpDest = URX_VAL(op);
2978                if (jmpDest < loc) {
2979                    // Loop of some kind.  Can safely ignore, the worst that will happen
2980                    //  is that we understate the true minimum length
2981                    currentLen = forwardedLength.elementAti(loc+1);
2982                } else {
2983                    // Forward jump.  Propagate the current min length to the target loc of the jump.
2984                    U_ASSERT(jmpDest <= end+1);
2985                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
2986                        forwardedLength.setElementAt(currentLen, jmpDest);
2987                    }
2988                }
2989            }
2990            break;
2991
2992        case URX_BACKTRACK:
2993            {
2994                // Back-tracks are kind of like a branch, except that the min length was
2995                //   propagated already, by the state save.
2996                currentLen = forwardedLength.elementAti(loc+1);
2997            }
2998            break;
2999
3000
3001        case URX_STATE_SAVE:
3002            {
3003                // State Save, for forward jumps, propagate the current minimum.
3004                //             of the state save.
3005                int32_t  jmpDest = URX_VAL(op);
3006                if (jmpDest > loc) {
3007                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
3008                        forwardedLength.setElementAt(currentLen, jmpDest);
3009                    }
3010                }
3011            }
3012            break;
3013
3014
3015        case URX_STRING:
3016            {
3017                loc++;
3018                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3019                currentLen += URX_VAL(stringLenOp);
3020            }
3021            break;
3022
3023
3024        case URX_STRING_I:
3025            {
3026                loc++;
3027                // TODO: with full case folding, matching input text may be shorter than
3028                //       the string we have here.  More smarts could put some bounds on it.
3029                //       Assume a min length of one for now.  A min length of zero causes
3030                //        optimization failures for a pattern like "string"+
3031                // currentLen += URX_VAL(stringLenOp);
3032                currentLen += 1;
3033            }
3034            break;
3035
3036        case URX_CTR_INIT:
3037        case URX_CTR_INIT_NG:
3038            {
3039                // Loop Init Ops.
3040                //   If the min loop count == 0
3041                //      move loc forwards to the end of the loop, skipping over the body.
3042                //   If the min count is > 0,
3043                //      continue normal processing of the body of the loop.
3044                int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3045                        loopEndLoc   = URX_VAL(loopEndLoc);
3046                int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3047                if (minLoopCount == 0) {
3048                    loc = loopEndLoc;
3049                } else {
3050                    loc+=3;  // Skips over operands of CTR_INIT
3051                }
3052            }
3053            break;
3054
3055
3056        case URX_CTR_LOOP:
3057        case URX_CTR_LOOP_NG:
3058            // Loop ops.
3059            //  The jump is conditional, backwards only.
3060            break;
3061
3062        case URX_LOOP_SR_I:
3063        case URX_LOOP_DOT_I:
3064        case URX_LOOP_C:
3065            // More loop ops.  These state-save to themselves.
3066            //   don't change the minimum match - could match nothing at all.
3067            break;
3068
3069
3070        case URX_LA_START:
3071        case URX_LB_START:
3072            {
3073                // Look-around.  Scan forward until the matching look-ahead end,
3074                //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3075                //   it assumes that the look-ahead match might be zero-length.
3076                //   TODO:  Positive lookahead could recursively do the block, then continue
3077                //          with the longer of the block or the value coming in.  Ticket 6060
3078                int32_t  depth = (opType == URX_LA_START? 2: 1);;
3079                for (;;) {
3080                    loc++;
3081                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3082                    if (URX_TYPE(op) == URX_LA_START) {
3083                        // The boilerplate for look-ahead includes two LA_END insturctions,
3084                        //    Depth will be decremented by each one when it is seen.
3085                        depth += 2;
3086                    }
3087                    if (URX_TYPE(op) == URX_LB_START) {
3088                        depth++;
3089                    }
3090                    if (URX_TYPE(op) == URX_LA_END) {
3091                        depth--;
3092                        if (depth == 0) {
3093                            break;
3094                        }
3095                    }
3096                    if (URX_TYPE(op)==URX_LBN_END) {
3097                        depth--;
3098                        if (depth == 0) {
3099                            break;
3100                        }
3101                    }
3102                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3103                        // Need this because neg lookahead blocks will FAIL to outside
3104                        //   of the block.
3105                        int32_t  jmpDest = URX_VAL(op);
3106                        if (jmpDest > loc) {
3107                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3108                                forwardedLength.setElementAt(currentLen, jmpDest);
3109                            }
3110                        }
3111                    }
3112                    U_ASSERT(loc <= end);
3113                }
3114            }
3115            break;
3116
3117        case URX_LA_END:
3118        case URX_LB_CONT:
3119        case URX_LB_END:
3120        case URX_LBN_CONT:
3121        case URX_LBN_END:
3122            // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3123            //   range being sized, which happens when measuring size of look-behind blocks.
3124            break;
3125
3126        default:
3127            U_ASSERT(FALSE);
3128            }
3129
3130        }
3131
3132    // We have finished walking through the ops.  Check whether some forward jump
3133    //   propagated a shorter length to location end+1.
3134    if (forwardedLength.elementAti(end+1) < currentLen) {
3135        currentLen = forwardedLength.elementAti(end+1);
3136        U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3137    }
3138
3139    return currentLen;
3140}
3141
3142// Increment with overflow check.
3143// val and delta will both be positive.
3144
3145static int32_t safeIncrement(int32_t val, int32_t delta) {
3146    if (INT32_MAX - val > delta) {
3147        return val + delta;
3148    } else {
3149        return INT32_MAX;
3150    }
3151}
3152
3153
3154//------------------------------------------------------------------------------
3155//
3156//   maxMatchLength    Calculate the length of the longest string that could
3157//                     match the specified pattern.
3158//                     Length is in 16 bit code units, not code points.
3159//
3160//                     The calculated length may not be exact.  The returned
3161//                     value may be longer than the actual maximum; it must
3162//                     never be shorter.
3163//
3164//------------------------------------------------------------------------------
3165int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3166    if (U_FAILURE(*fStatus)) {
3167        return 0;
3168    }
3169    U_ASSERT(start <= end);
3170    U_ASSERT(end < fRXPat->fCompiledPat->size());
3171
3172
3173    int32_t    loc;
3174    int32_t    op;
3175    int32_t    opType;
3176    int32_t    currentLen = 0;
3177    UVector32  forwardedLength(end+1, *fStatus);
3178    forwardedLength.setSize(end+1);
3179
3180    for (loc=start; loc<=end; loc++) {
3181        forwardedLength.setElementAt(0, loc);
3182    }
3183
3184    for (loc = start; loc<=end; loc++) {
3185        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3186        opType = URX_TYPE(op);
3187
3188        // The loop is advancing linearly through the pattern.
3189        // If the op we are now at was the destination of a branch in the pattern,
3190        // and that path has a longer maximum length than the current accumulated value,
3191        // replace the current accumulated value.
3192        if (forwardedLength.elementAti(loc) > currentLen) {
3193            currentLen = forwardedLength.elementAti(loc);
3194        }
3195
3196        switch (opType) {
3197            // Ops that don't change the total length matched
3198        case URX_RESERVED_OP:
3199        case URX_END:
3200        case URX_STRING_LEN:
3201        case URX_NOP:
3202        case URX_START_CAPTURE:
3203        case URX_END_CAPTURE:
3204        case URX_BACKSLASH_B:
3205        case URX_BACKSLASH_BU:
3206        case URX_BACKSLASH_G:
3207        case URX_BACKSLASH_Z:
3208        case URX_CARET:
3209        case URX_DOLLAR:
3210        case URX_DOLLAR_M:
3211        case URX_DOLLAR_D:
3212        case URX_DOLLAR_MD:
3213        case URX_RELOC_OPRND:
3214        case URX_STO_INP_LOC:
3215        case URX_CARET_M:
3216        case URX_CARET_M_UNIX:
3217
3218        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3219        case URX_LD_SP:
3220
3221        case URX_LB_END:
3222        case URX_LB_CONT:
3223        case URX_LBN_CONT:
3224        case URX_LBN_END:
3225            break;
3226
3227
3228            // Ops that increase that cause an unbounded increase in the length
3229            //   of a matched string, or that increase it a hard to characterize way.
3230            //   Call the max length unbounded, and stop further checking.
3231        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3232        case URX_BACKREF_I:
3233        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3234            currentLen = INT32_MAX;
3235            break;
3236
3237
3238            // Ops that match a max of one character (possibly two 16 bit code units.)
3239            //
3240        case URX_STATIC_SETREF:
3241        case URX_STAT_SETREF_N:
3242        case URX_SETREF:
3243        case URX_BACKSLASH_D:
3244        case URX_ONECHAR_I:
3245        case URX_DOTANY_ALL:
3246        case URX_DOTANY:
3247        case URX_DOTANY_UNIX:
3248            currentLen = safeIncrement(currentLen, 2);
3249            break;
3250
3251            // Single literal character.  Increase current max length by one or two,
3252            //       depending on whether the char is in the supplementary range.
3253        case URX_ONECHAR:
3254            currentLen = safeIncrement(currentLen, 1);
3255            if (URX_VAL(op) > 0x10000) {
3256                currentLen = safeIncrement(currentLen, 1);
3257            }
3258            break;
3259
3260            // Jumps.
3261            //
3262        case URX_JMP:
3263        case URX_JMPX:
3264        case URX_JMP_SAV:
3265        case URX_JMP_SAV_X:
3266            {
3267                int32_t  jmpDest = URX_VAL(op);
3268                if (jmpDest < loc) {
3269                    // Loop of some kind.  Max match length is unbounded.
3270                    currentLen = INT32_MAX;
3271                } else {
3272                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3273                    if (forwardedLength.elementAti(jmpDest) < currentLen) {
3274                        forwardedLength.setElementAt(currentLen, jmpDest);
3275                    }
3276                    currentLen = 0;
3277                }
3278            }
3279            break;
3280
3281        case URX_BACKTRACK:
3282            // back-tracks are kind of like a branch, except that the max length was
3283            //   propagated already, by the state save.
3284            currentLen = forwardedLength.elementAti(loc+1);
3285            break;
3286
3287
3288        case URX_STATE_SAVE:
3289            {
3290                // State Save, for forward jumps, propagate the current minimum.
3291                //               of the state save.
3292                //             For backwards jumps, they create a loop, maximum
3293                //               match length is unbounded.
3294                int32_t  jmpDest = URX_VAL(op);
3295                if (jmpDest > loc) {
3296                    if (currentLen > forwardedLength.elementAti(jmpDest)) {
3297                        forwardedLength.setElementAt(currentLen, jmpDest);
3298                    }
3299                } else {
3300                    currentLen = INT32_MAX;
3301                }
3302            }
3303            break;
3304
3305
3306
3307
3308        case URX_STRING:
3309            {
3310                loc++;
3311                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3312                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3313                break;
3314            }
3315
3316        case URX_STRING_I:
3317            // TODO:  This code assumes that any user string that matches will be no longer
3318            //        than our compiled string, with case insensitive matching.
3319            //        Our compiled string has been case-folded already.
3320            //
3321            //        Any matching user string will have no more code points than our
3322            //        compiled (folded) string.  Folding may add code points, but
3323            //        not remove them.
3324            //
3325            //        There is a potential problem if a supplemental code point
3326            //        case-folds to a BMP code point.  In this case our compiled string
3327            //        could be shorter (in code units) than a matching user string.
3328            //
3329            //        At this time (Unicode 6.1) there are no such characters, and this case
3330            //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3331            //        any problematic characters are added to Unicode.
3332            //
3333            //        If this happens, we can make a set of the BMP chars that the
3334            //        troublesome supplementals fold to, scan our string, and bump the
3335            //        currentLen one extra for each that is found.
3336            //
3337            {
3338                loc++;
3339                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3340                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3341            }
3342            break;
3343
3344        case URX_CTR_INIT:
3345        case URX_CTR_INIT_NG:
3346            // For Loops, recursively call this function on the pattern for the loop body,
3347            //   then multiply the result by the maximum loop count.
3348            {
3349                int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3350                if (loopEndLoc == loc+4) {
3351                    // Loop has an empty body. No affect on max match length.
3352                    // Continue processing with code after the loop end.
3353                    loc = loopEndLoc;
3354                    break;
3355                }
3356
3357                int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3);
3358                if (maxLoopCount == -1) {
3359                    // Unbounded Loop. No upper bound on match length.
3360                    currentLen = INT32_MAX;
3361                    break;
3362                }
3363
3364                U_ASSERT(loopEndLoc >= loc+4);
3365                int32_t  blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3366                if (blockLen == INT32_MAX) {
3367                    currentLen = blockLen;
3368                    break;
3369                }
3370                currentLen += blockLen * maxLoopCount;
3371                loc = loopEndLoc;
3372                break;
3373            }
3374
3375        case URX_CTR_LOOP:
3376        case URX_CTR_LOOP_NG:
3377            // These opcodes will be skipped over by code for URX_CRT_INIT.
3378            // We shouldn't encounter them here.
3379            U_ASSERT(FALSE);
3380            break;
3381
3382        case URX_LOOP_SR_I:
3383        case URX_LOOP_DOT_I:
3384        case URX_LOOP_C:
3385            // For anything to do with loops, make the match length unbounded.
3386            currentLen = INT32_MAX;
3387            break;
3388
3389
3390
3391        case URX_LA_START:
3392        case URX_LA_END:
3393            // Look-ahead.  Just ignore, treat the look-ahead block as if
3394            // it were normal pattern.  Gives a too-long match length,
3395            //  but good enough for now.
3396            break;
3397
3398            // End of look-ahead ops should always be consumed by the processing at
3399            //  the URX_LA_START op.
3400            // U_ASSERT(FALSE);
3401            // break;
3402
3403        case URX_LB_START:
3404            {
3405                // Look-behind.  Scan forward until the matching look-around end,
3406                //   without processing the look-behind block.
3407                int32_t  depth = 0;
3408                for (;;) {
3409                    loc++;
3410                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3411                    if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3412                        depth++;
3413                    }
3414                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3415                        if (depth == 0) {
3416                            break;
3417                        }
3418                        depth--;
3419                    }
3420                    U_ASSERT(loc < end);
3421                }
3422            }
3423            break;
3424
3425        default:
3426            U_ASSERT(FALSE);
3427        }
3428
3429
3430        if (currentLen == INT32_MAX) {
3431            //  The maximum length is unbounded.
3432            //  Stop further processing of the pattern.
3433            break;
3434        }
3435
3436    }
3437    return currentLen;
3438
3439}
3440
3441
3442//------------------------------------------------------------------------------
3443//
3444//   stripNOPs    Remove any NOP operations from the compiled pattern code.
3445//                Extra NOPs are inserted for some constructs during the initial
3446//                code generation to provide locations that may be patched later.
3447//                Many end up unneeded, and are removed by this function.
3448//
3449//                In order to minimize the number of passes through the pattern,
3450//                back-reference fixup is also performed here (adjusting
3451//                back-reference operands to point to the correct frame offsets).
3452//
3453//------------------------------------------------------------------------------
3454void RegexCompile::stripNOPs() {
3455
3456    if (U_FAILURE(*fStatus)) {
3457        return;
3458    }
3459
3460    int32_t    end = fRXPat->fCompiledPat->size();
3461    UVector32  deltas(end, *fStatus);
3462
3463    // Make a first pass over the code, computing the amount that things
3464    //   will be offset at each location in the original code.
3465    int32_t   loc;
3466    int32_t   d = 0;
3467    for (loc=0; loc<end; loc++) {
3468        deltas.addElement(d, *fStatus);
3469        int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3470        if (URX_TYPE(op) == URX_NOP) {
3471            d++;
3472        }
3473    }
3474
3475    UnicodeString caseStringBuffer;
3476
3477    // Make a second pass over the code, removing the NOPs by moving following
3478    //  code up, and patching operands that refer to code locations that
3479    //  are being moved.  The array of offsets from the first step is used
3480    //  to compute the new operand values.
3481    int32_t src;
3482    int32_t dst = 0;
3483    for (src=0; src<end; src++) {
3484        int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3485        int32_t opType = URX_TYPE(op);
3486        switch (opType) {
3487        case URX_NOP:
3488            break;
3489
3490        case URX_STATE_SAVE:
3491        case URX_JMP:
3492        case URX_CTR_LOOP:
3493        case URX_CTR_LOOP_NG:
3494        case URX_RELOC_OPRND:
3495        case URX_JMPX:
3496        case URX_JMP_SAV:
3497        case URX_JMP_SAV_X:
3498            // These are instructions with operands that refer to code locations.
3499            {
3500                int32_t  operandAddress = URX_VAL(op);
3501                U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3502                int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3503                op = URX_BUILD(opType, fixedOperandAddress);
3504                fRXPat->fCompiledPat->setElementAt(op, dst);
3505                dst++;
3506                break;
3507            }
3508
3509        case URX_BACKREF:
3510        case URX_BACKREF_I:
3511            {
3512                int32_t where = URX_VAL(op);
3513                if (where > fRXPat->fGroupMap->size()) {
3514                    error(U_REGEX_INVALID_BACK_REF);
3515                    break;
3516                }
3517                where = fRXPat->fGroupMap->elementAti(where-1);
3518                op    = URX_BUILD(opType, where);
3519                fRXPat->fCompiledPat->setElementAt(op, dst);
3520                dst++;
3521
3522                fRXPat->fNeedsAltInput = TRUE;
3523                break;
3524            }
3525        case URX_RESERVED_OP:
3526        case URX_RESERVED_OP_N:
3527        case URX_BACKTRACK:
3528        case URX_END:
3529        case URX_ONECHAR:
3530        case URX_STRING:
3531        case URX_STRING_LEN:
3532        case URX_START_CAPTURE:
3533        case URX_END_CAPTURE:
3534        case URX_STATIC_SETREF:
3535        case URX_STAT_SETREF_N:
3536        case URX_SETREF:
3537        case URX_DOTANY:
3538        case URX_FAIL:
3539        case URX_BACKSLASH_B:
3540        case URX_BACKSLASH_BU:
3541        case URX_BACKSLASH_G:
3542        case URX_BACKSLASH_X:
3543        case URX_BACKSLASH_Z:
3544        case URX_DOTANY_ALL:
3545        case URX_BACKSLASH_D:
3546        case URX_CARET:
3547        case URX_DOLLAR:
3548        case URX_CTR_INIT:
3549        case URX_CTR_INIT_NG:
3550        case URX_DOTANY_UNIX:
3551        case URX_STO_SP:
3552        case URX_LD_SP:
3553        case URX_STO_INP_LOC:
3554        case URX_LA_START:
3555        case URX_LA_END:
3556        case URX_ONECHAR_I:
3557        case URX_STRING_I:
3558        case URX_DOLLAR_M:
3559        case URX_CARET_M:
3560        case URX_CARET_M_UNIX:
3561        case URX_LB_START:
3562        case URX_LB_CONT:
3563        case URX_LB_END:
3564        case URX_LBN_CONT:
3565        case URX_LBN_END:
3566        case URX_LOOP_SR_I:
3567        case URX_LOOP_DOT_I:
3568        case URX_LOOP_C:
3569        case URX_DOLLAR_D:
3570        case URX_DOLLAR_MD:
3571            // These instructions are unaltered by the relocation.
3572            fRXPat->fCompiledPat->setElementAt(op, dst);
3573            dst++;
3574            break;
3575
3576        default:
3577            // Some op is unaccounted for.
3578            U_ASSERT(FALSE);
3579            error(U_REGEX_INTERNAL_ERROR);
3580        }
3581    }
3582
3583    fRXPat->fCompiledPat->setSize(dst);
3584}
3585
3586
3587
3588
3589//------------------------------------------------------------------------------
3590//
3591//  Error         Report a rule parse error.
3592//                Only report it if no previous error has been recorded.
3593//
3594//------------------------------------------------------------------------------
3595void RegexCompile::error(UErrorCode e) {
3596    if (U_SUCCESS(*fStatus)) {
3597        *fStatus = e;
3598        // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3599        // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3600        // int64_t. If the values of the latter are out of range for the former,
3601        // set them to the appropriate "field not supported" values.
3602        if (fLineNum > 0x7FFFFFFF) {
3603            fParseErr->line   = 0;
3604            fParseErr->offset = -1;
3605        } else if (fCharNum > 0x7FFFFFFF) {
3606            fParseErr->line   = (int32_t)fLineNum;
3607            fParseErr->offset = -1;
3608        } else {
3609            fParseErr->line   = (int32_t)fLineNum;
3610            fParseErr->offset = (int32_t)fCharNum;
3611        }
3612
3613        UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3614
3615        // Fill in the context.
3616        //   Note: extractBetween() pins supplied indicies to the string bounds.
3617        uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3618        uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3619        utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3620        utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3621    }
3622}
3623
3624
3625//
3626//  Assorted Unicode character constants.
3627//     Numeric because there is no portable way to enter them as literals.
3628//     (Think EBCDIC).
3629//
3630static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3631static const UChar      chLF        = 0x0a;      // Line Feed
3632static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3633static const UChar      chDigit0    = 0x30;      // '0'
3634static const UChar      chDigit7    = 0x37;      // '9'
3635static const UChar      chColon     = 0x3A;      // ':'
3636static const UChar      chE         = 0x45;      // 'E'
3637static const UChar      chQ         = 0x51;      // 'Q'
3638//static const UChar      chN         = 0x4E;      // 'N'
3639static const UChar      chP         = 0x50;      // 'P'
3640static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3641//static const UChar      chLBracket  = 0x5b;      // '['
3642static const UChar      chRBracket  = 0x5d;      // ']'
3643static const UChar      chUp        = 0x5e;      // '^'
3644static const UChar      chLowerP    = 0x70;
3645static const UChar      chLBrace    = 0x7b;      // '{'
3646static const UChar      chRBrace    = 0x7d;      // '}'
3647static const UChar      chNEL       = 0x85;      //    NEL newline variant
3648static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3649
3650
3651//------------------------------------------------------------------------------
3652//
3653//  nextCharLL    Low Level Next Char from the regex pattern.
3654//                Get a char from the string, keep track of input position
3655//                     for error reporting.
3656//
3657//------------------------------------------------------------------------------
3658UChar32  RegexCompile::nextCharLL() {
3659    UChar32       ch;
3660
3661    if (fPeekChar != -1) {
3662        ch = fPeekChar;
3663        fPeekChar = -1;
3664        return ch;
3665    }
3666
3667    // assume we're already in the right place
3668    ch = UTEXT_NEXT32(fRXPat->fPattern);
3669    if (ch == U_SENTINEL) {
3670        return ch;
3671    }
3672
3673    if (ch == chCR ||
3674        ch == chNEL ||
3675        ch == chLS   ||
3676        (ch == chLF && fLastChar != chCR)) {
3677        // Character is starting a new line.  Bump up the line number, and
3678        //  reset the column to 0.
3679        fLineNum++;
3680        fCharNum=0;
3681    }
3682    else {
3683        // Character is not starting a new line.  Except in the case of a
3684        //   LF following a CR, increment the column position.
3685        if (ch != chLF) {
3686            fCharNum++;
3687        }
3688    }
3689    fLastChar = ch;
3690    return ch;
3691}
3692
3693//------------------------------------------------------------------------------
3694//
3695//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
3696//                 character without actually getting it.
3697//
3698//------------------------------------------------------------------------------
3699UChar32  RegexCompile::peekCharLL() {
3700    if (fPeekChar == -1) {
3701        fPeekChar = nextCharLL();
3702    }
3703    return fPeekChar;
3704}
3705
3706
3707//------------------------------------------------------------------------------
3708//
3709//   nextChar     for pattern scanning.  At this level, we handle stripping
3710//                out comments and processing some backslash character escapes.
3711//                The rest of the pattern grammar is handled at the next level up.
3712//
3713//------------------------------------------------------------------------------
3714void RegexCompile::nextChar(RegexPatternChar &c) {
3715
3716    fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
3717    c.fChar    = nextCharLL();
3718    c.fQuoted  = FALSE;
3719
3720    if (fQuoteMode) {
3721        c.fQuoted = TRUE;
3722        if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
3723            c.fChar == (UChar32)-1) {
3724            fQuoteMode = FALSE;  //  Exit quote mode,
3725            nextCharLL();        // discard the E
3726            nextChar(c);         // recurse to get the real next char
3727        }
3728    }
3729    else if (fInBackslashQuote) {
3730        // The current character immediately follows a '\'
3731        // Don't check for any further escapes, just return it as-is.
3732        // Don't set c.fQuoted, because that would prevent the state machine from
3733        //    dispatching on the character.
3734        fInBackslashQuote = FALSE;
3735    }
3736    else
3737    {
3738        // We are not in a \Q quoted region \E of the source.
3739        //
3740        if (fModeFlags & UREGEX_COMMENTS) {
3741            //
3742            // We are in free-spacing and comments mode.
3743            //  Scan through any white space and comments, until we
3744            //  reach a significant character or the end of inut.
3745            for (;;) {
3746                if (c.fChar == (UChar32)-1) {
3747                    break;     // End of Input
3748                }
3749                if  (c.fChar == chPound && fEOLComments == TRUE) {
3750                    // Start of a comment.  Consume the rest of it, until EOF or a new line
3751                    for (;;) {
3752                        c.fChar = nextCharLL();
3753                        if (c.fChar == (UChar32)-1 ||  // EOF
3754                            c.fChar == chCR        ||
3755                            c.fChar == chLF        ||
3756                            c.fChar == chNEL       ||
3757                            c.fChar == chLS)       {
3758                            break;
3759                        }
3760                    }
3761                }
3762                // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
3763                if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
3764                    break;
3765                }
3766                c.fChar = nextCharLL();
3767            }
3768        }
3769
3770        //
3771        //  check for backslash escaped characters.
3772        //
3773        if (c.fChar == chBackSlash) {
3774            int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
3775            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
3776                //
3777                // A '\' sequence that is handled by ICU's standard unescapeAt function.
3778                //   Includes \uxxxx, \n, \r, many others.
3779                //   Return the single equivalent character.
3780                //
3781                nextCharLL();                 // get & discard the peeked char.
3782                c.fQuoted = TRUE;
3783
3784                if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
3785                    int32_t endIndex = (int32_t)pos;
3786                    c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
3787
3788                    if (endIndex == pos) {
3789                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3790                    }
3791                    fCharNum += endIndex - pos;
3792                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
3793                } else {
3794                    int32_t offset = 0;
3795                    struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
3796
3797                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
3798                    c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
3799
3800                    if (offset == 0) {
3801                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3802                    } else if (context.lastOffset == offset) {
3803                        UTEXT_PREVIOUS32(fRXPat->fPattern);
3804                    } else if (context.lastOffset != offset-1) {
3805                        utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
3806                    }
3807                    fCharNum += offset;
3808                }
3809            }
3810            else if (peekCharLL() == chDigit0) {
3811                //  Octal Escape, using Java Regexp Conventions
3812                //    which are \0 followed by 1-3 octal digits.
3813                //    Different from ICU Unescape handling of Octal, which does not
3814                //    require the leading 0.
3815                //  Java also has the convention of only consuming 2 octal digits if
3816                //    the three digit number would be > 0xff
3817                //
3818                c.fChar = 0;
3819                nextCharLL();    // Consume the initial 0.
3820                int index;
3821                for (index=0; index<3; index++) {
3822                    int32_t ch = peekCharLL();
3823                    if (ch<chDigit0 || ch>chDigit7) {
3824                        if (index==0) {
3825                           // \0 is not followed by any octal digits.
3826                           error(U_REGEX_BAD_ESCAPE_SEQUENCE);
3827                        }
3828                        break;
3829                    }
3830                    c.fChar <<= 3;
3831                    c.fChar += ch&7;
3832                    if (c.fChar <= 255) {
3833                        nextCharLL();
3834                    } else {
3835                        // The last digit made the number too big.  Forget we saw it.
3836                        c.fChar >>= 3;
3837                    }
3838                }
3839                c.fQuoted = TRUE;
3840            }
3841            else if (peekCharLL() == chQ) {
3842                //  "\Q"  enter quote mode, which will continue until "\E"
3843                fQuoteMode = TRUE;
3844                nextCharLL();       // discard the 'Q'.
3845                nextChar(c);        // recurse to get the real next char.
3846            }
3847            else
3848            {
3849                // We are in a '\' escape that will be handled by the state table scanner.
3850                // Just return the backslash, but remember that the following char is to
3851                //  be taken literally.
3852                fInBackslashQuote = TRUE;
3853            }
3854        }
3855    }
3856
3857    // re-enable # to end-of-line comments, in case they were disabled.
3858    // They are disabled by the parser upon seeing '(?', but this lasts for
3859    //  the fetching of the next character only.
3860    fEOLComments = TRUE;
3861
3862    // putc(c.fChar, stdout);
3863}
3864
3865
3866
3867//------------------------------------------------------------------------------
3868//
3869//  scanNamedChar
3870 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
3871//
3872//             The scan position will be at the 'N'.  On return
3873//             the scan position should be just after the '}'
3874//
3875//             Return the UChar32
3876//
3877//------------------------------------------------------------------------------
3878UChar32  RegexCompile::scanNamedChar() {
3879    if (U_FAILURE(*fStatus)) {
3880        return 0;
3881    }
3882
3883    nextChar(fC);
3884    if (fC.fChar != chLBrace) {
3885        error(U_REGEX_PROPERTY_SYNTAX);
3886        return 0;
3887    }
3888
3889    UnicodeString  charName;
3890    for (;;) {
3891        nextChar(fC);
3892        if (fC.fChar == chRBrace) {
3893            break;
3894        }
3895        if (fC.fChar == -1) {
3896            error(U_REGEX_PROPERTY_SYNTAX);
3897            return 0;
3898        }
3899        charName.append(fC.fChar);
3900    }
3901
3902    char name[100];
3903    if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
3904         (uint32_t)charName.length()>=sizeof(name)) {
3905        // All Unicode character names have only invariant characters.
3906        // The API to get a character, given a name, accepts only char *, forcing us to convert,
3907        //   which requires this error check
3908        error(U_REGEX_PROPERTY_SYNTAX);
3909        return 0;
3910    }
3911    charName.extract(0, charName.length(), name, sizeof(name), US_INV);
3912
3913    UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
3914    if (U_FAILURE(*fStatus)) {
3915        error(U_REGEX_PROPERTY_SYNTAX);
3916    }
3917
3918    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
3919    return theChar;
3920}
3921
3922//------------------------------------------------------------------------------
3923//
3924//  scanProp   Construct a UnicodeSet from the text at the current scan
3925//             position, which will be of the form \p{whaterver}
3926//
3927//             The scan position will be at the 'p' or 'P'.  On return
3928//             the scan position should be just after the '}'
3929//
3930//             Return a UnicodeSet, constructed from the \P pattern,
3931//             or NULL if the pattern is invalid.
3932//
3933//------------------------------------------------------------------------------
3934UnicodeSet *RegexCompile::scanProp() {
3935    UnicodeSet    *uset = NULL;
3936
3937    if (U_FAILURE(*fStatus)) {
3938        return NULL;
3939    }
3940    (void)chLowerP;   // Suppress compiler unused variable warning.
3941    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
3942    UBool negated = (fC.fChar == chP);
3943
3944    UnicodeString propertyName;
3945    nextChar(fC);
3946    if (fC.fChar != chLBrace) {
3947        error(U_REGEX_PROPERTY_SYNTAX);
3948        return NULL;
3949    }
3950    for (;;) {
3951        nextChar(fC);
3952        if (fC.fChar == chRBrace) {
3953            break;
3954        }
3955        if (fC.fChar == -1) {
3956            // Hit the end of the input string without finding the closing '}'
3957            error(U_REGEX_PROPERTY_SYNTAX);
3958            return NULL;
3959        }
3960        propertyName.append(fC.fChar);
3961    }
3962    uset = createSetForProperty(propertyName, negated);
3963    nextChar(fC);    // Move input scan to position following the closing '}'
3964    return uset;
3965}
3966
3967//------------------------------------------------------------------------------
3968//
3969//  scanPosixProp   Construct a UnicodeSet from the text at the current scan
3970//             position, which is expected be of the form [:property expression:]
3971//
3972//             The scan position will be at the opening ':'.  On return
3973//             the scan position must be on the closing ']'
3974//
3975//             Return a UnicodeSet constructed from the pattern,
3976//             or NULL if this is not a valid POSIX-style set expression.
3977//             If not a property expression, restore the initial scan position
3978//                (to the opening ':')
3979//
3980//               Note:  the opening '[:' is not sufficient to guarantee that
3981//                      this is a [:property:] expression.
3982//                      [:'+=,] is a perfectly good ordinary set expression that
3983//                              happens to include ':' as one of its characters.
3984//
3985//------------------------------------------------------------------------------
3986UnicodeSet *RegexCompile::scanPosixProp() {
3987    UnicodeSet    *uset = NULL;
3988
3989    if (U_FAILURE(*fStatus)) {
3990        return NULL;
3991    }
3992
3993    U_ASSERT(fC.fChar == chColon);
3994
3995    // Save the scanner state.
3996    // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
3997    int64_t     savedScanIndex        = fScanIndex;
3998    int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
3999    UBool       savedQuoteMode        = fQuoteMode;
4000    UBool       savedInBackslashQuote = fInBackslashQuote;
4001    UBool       savedEOLComments      = fEOLComments;
4002    int64_t     savedLineNum          = fLineNum;
4003    int64_t     savedCharNum          = fCharNum;
4004    UChar32     savedLastChar         = fLastChar;
4005    UChar32     savedPeekChar         = fPeekChar;
4006    RegexPatternChar savedfC          = fC;
4007
4008    // Scan for a closing ].   A little tricky because there are some perverse
4009    //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4010    //   ending on the second closing ].
4011
4012    UnicodeString propName;
4013    UBool         negated  = FALSE;
4014
4015    // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4016    nextChar(fC);
4017    if (fC.fChar == chUp) {
4018       negated = TRUE;
4019       nextChar(fC);
4020    }
4021
4022    // Scan for the closing ":]", collecting the property name along the way.
4023    UBool  sawPropSetTerminator = FALSE;
4024    for (;;) {
4025        propName.append(fC.fChar);
4026        nextChar(fC);
4027        if (fC.fQuoted || fC.fChar == -1) {
4028            // Escaped characters or end of input - either says this isn't a [:Property:]
4029            break;
4030        }
4031        if (fC.fChar == chColon) {
4032            nextChar(fC);
4033            if (fC.fChar == chRBracket) {
4034                sawPropSetTerminator = TRUE;
4035            }
4036            break;
4037        }
4038    }
4039
4040    if (sawPropSetTerminator) {
4041        uset = createSetForProperty(propName, negated);
4042    }
4043    else
4044    {
4045        // No closing ":]".
4046        //  Restore the original scan position.
4047        //  The main scanner will retry the input as a normal set expression,
4048        //    not a [:Property:] expression.
4049        fScanIndex        = savedScanIndex;
4050        fQuoteMode        = savedQuoteMode;
4051        fInBackslashQuote = savedInBackslashQuote;
4052        fEOLComments      = savedEOLComments;
4053        fLineNum          = savedLineNum;
4054        fCharNum          = savedCharNum;
4055        fLastChar         = savedLastChar;
4056        fPeekChar         = savedPeekChar;
4057        fC                = savedfC;
4058        UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4059    }
4060    return uset;
4061}
4062
4063static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4064    set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4065    addCategory(set, U_GC_CF_MASK, ec);
4066}
4067
4068//
4069//  Create a Unicode Set from a Unicode Property expression.
4070//     This is common code underlying both \p{...} ane [:...:] expressions.
4071//     Includes trying the Java "properties" that aren't supported as
4072//     normal ICU UnicodeSet properties
4073//
4074static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4075static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
4076UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4077    UnicodeString   setExpr;
4078    UnicodeSet      *set;
4079    uint32_t        usetFlags = 0;
4080
4081    if (U_FAILURE(*fStatus)) {
4082        return NULL;
4083    }
4084
4085    //
4086    //  First try the property as we received it
4087    //
4088    if (negated) {
4089        setExpr.append(negSetPrefix, -1);
4090    } else {
4091        setExpr.append(posSetPrefix, -1);
4092    }
4093    setExpr.append(propName);
4094    setExpr.append(chRBrace);
4095    setExpr.append(chRBracket);
4096    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4097        usetFlags |= USET_CASE_INSENSITIVE;
4098    }
4099    set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4100    if (U_SUCCESS(*fStatus)) {
4101       return set;
4102    }
4103    delete set;
4104    set = NULL;
4105
4106    //
4107    //  The property as it was didn't work.
4108
4109    //  Do [:word:]. It is not recognized as a property by UnicodeSet.  "word" not standard POSIX
4110    //     or standard Java, but many other regular expression packages do recognize it.
4111
4112    if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4113        *fStatus = U_ZERO_ERROR;
4114        set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4115        if (set == NULL) {
4116            *fStatus = U_MEMORY_ALLOCATION_ERROR;
4117            return set;
4118        }
4119        if (negated) {
4120            set->complement();
4121        }
4122        return set;
4123    }
4124
4125
4126    //    Do Java fixes -
4127    //       InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4128    //       InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4129    //
4130    //       Note on Spaces:  either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4131    //                        is accepted by Java.  The property part of the name is compared
4132    //                        case-insenstively.  The spaces must be exactly as shown, either
4133    //                        all there, or all omitted, with exactly one at each position
4134    //                        if they are present.  From checking against JDK 1.6
4135    //
4136    //       This code should be removed when ICU properties support the Java  compatibility names
4137    //          (ICU 4.0?)
4138    //
4139    UnicodeString mPropName = propName;
4140    if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4141        mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4142    }
4143    if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4144        mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4145        mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4146    }
4147    else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4148        mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4149    }
4150
4151    //    See if the property looks like a Java "InBlockName", which
4152    //    we will recast as "Block=BlockName"
4153    //
4154    static const UChar IN[] = {0x49, 0x6E, 0};  // "In"
4155    static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00};  // "Block="
4156    if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4157        setExpr.truncate(4);   // Leaves "[\p{", or "[\P{"
4158        setExpr.append(BLOCK, -1);
4159        setExpr.append(UnicodeString(mPropName, 2));  // Property with the leading "In" removed.
4160        setExpr.append(chRBrace);
4161        setExpr.append(chRBracket);
4162        *fStatus = U_ZERO_ERROR;
4163        set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4164        if (U_SUCCESS(*fStatus)) {
4165            return set;
4166        }
4167        delete set;
4168        set = NULL;
4169    }
4170
4171    if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4172        propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4173    {
4174        UErrorCode localStatus = U_ZERO_ERROR;
4175        //setExpr.remove();
4176        set = new UnicodeSet();
4177        //
4178        //  Try the various Java specific properties.
4179        //   These all begin with "java"
4180        //
4181        if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4182            addCategory(set, U_GC_CN_MASK, localStatus);
4183            set->complement();
4184        }
4185        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4186            addCategory(set, U_GC_ND_MASK, localStatus);
4187        }
4188        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4189            addIdentifierIgnorable(set, localStatus);
4190        }
4191        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4192            set->add(0, 0x1F).add(0x7F, 0x9F);
4193        }
4194        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4195            addCategory(set, U_GC_L_MASK, localStatus);
4196            addCategory(set, U_GC_SC_MASK, localStatus);
4197            addCategory(set, U_GC_PC_MASK, localStatus);
4198            addCategory(set, U_GC_ND_MASK, localStatus);
4199            addCategory(set, U_GC_NL_MASK, localStatus);
4200            addCategory(set, U_GC_MC_MASK, localStatus);
4201            addCategory(set, U_GC_MN_MASK, localStatus);
4202            addIdentifierIgnorable(set, localStatus);
4203        }
4204        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4205            addCategory(set, U_GC_L_MASK, localStatus);
4206            addCategory(set, U_GC_NL_MASK, localStatus);
4207            addCategory(set, U_GC_SC_MASK, localStatus);
4208            addCategory(set, U_GC_PC_MASK, localStatus);
4209        }
4210        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4211            addCategory(set, U_GC_L_MASK, localStatus);
4212        }
4213        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4214            addCategory(set, U_GC_L_MASK, localStatus);
4215            addCategory(set, U_GC_ND_MASK, localStatus);
4216        }
4217        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4218            addCategory(set, U_GC_LL_MASK, localStatus);
4219        }
4220        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4221            set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4222        }
4223        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4224            addCategory(set, U_GC_Z_MASK, localStatus);
4225        }
4226        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4227            set->add(0x10000, UnicodeSet::MAX_VALUE);
4228        }
4229        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4230            addCategory(set, U_GC_LT_MASK, localStatus);
4231        }
4232        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4233            addCategory(set, U_GC_L_MASK, localStatus);
4234            addCategory(set, U_GC_NL_MASK, localStatus);
4235        }
4236        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4237            addCategory(set, U_GC_L_MASK, localStatus);
4238            addCategory(set, U_GC_PC_MASK, localStatus);
4239            addCategory(set, U_GC_ND_MASK, localStatus);
4240            addCategory(set, U_GC_NL_MASK, localStatus);
4241            addCategory(set, U_GC_MC_MASK, localStatus);
4242            addCategory(set, U_GC_MN_MASK, localStatus);
4243            addIdentifierIgnorable(set, localStatus);
4244        }
4245        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4246            addCategory(set, U_GC_LU_MASK, localStatus);
4247        }
4248        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4249            set->add(0, UnicodeSet::MAX_VALUE);
4250        }
4251        else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4252            addCategory(set, U_GC_Z_MASK, localStatus);
4253            set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4254            set->add(9, 0x0d).add(0x1c, 0x1f);
4255        }
4256        else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4257            set->add(0, UnicodeSet::MAX_VALUE);
4258        }
4259
4260        if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4261            *fStatus = U_ZERO_ERROR;
4262            if (usetFlags & USET_CASE_INSENSITIVE) {
4263                set->closeOver(USET_CASE_INSENSITIVE);
4264            }
4265            if (negated) {
4266                set->complement();
4267            }
4268            return set;
4269        }
4270        delete set;
4271        set = NULL;
4272    }
4273    error(*fStatus);
4274    return NULL;
4275}
4276
4277
4278
4279//
4280//  SetEval   Part of the evaluation of [set expressions].
4281//            Perform any pending (stacked) operations with precedence
4282//            equal or greater to that of the next operator encountered
4283//            in the expression.
4284//
4285void RegexCompile::setEval(int32_t nextOp) {
4286    UnicodeSet *rightOperand = NULL;
4287    UnicodeSet *leftOperand  = NULL;
4288    for (;;) {
4289        U_ASSERT(fSetOpStack.empty()==FALSE);
4290        int32_t pendingSetOperation = fSetOpStack.peeki();
4291        if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4292            break;
4293        }
4294        fSetOpStack.popi();
4295        U_ASSERT(fSetStack.empty() == FALSE);
4296        rightOperand = (UnicodeSet *)fSetStack.peek();
4297        switch (pendingSetOperation) {
4298            case setNegation:
4299                rightOperand->complement();
4300                break;
4301            case setCaseClose:
4302                // TODO: need a simple close function.  Ticket 6065
4303                rightOperand->closeOver(USET_CASE_INSENSITIVE);
4304                rightOperand->removeAllStrings();
4305                break;
4306            case setDifference1:
4307            case setDifference2:
4308                fSetStack.pop();
4309                leftOperand = (UnicodeSet *)fSetStack.peek();
4310                leftOperand->removeAll(*rightOperand);
4311                delete rightOperand;
4312                break;
4313            case setIntersection1:
4314            case setIntersection2:
4315                fSetStack.pop();
4316                leftOperand = (UnicodeSet *)fSetStack.peek();
4317                leftOperand->retainAll(*rightOperand);
4318                delete rightOperand;
4319                break;
4320            case setUnion:
4321                fSetStack.pop();
4322                leftOperand = (UnicodeSet *)fSetStack.peek();
4323                leftOperand->addAll(*rightOperand);
4324                delete rightOperand;
4325                break;
4326            default:
4327                U_ASSERT(FALSE);
4328                break;
4329            }
4330        }
4331    }
4332
4333void RegexCompile::setPushOp(int32_t op) {
4334    setEval(op);
4335    fSetOpStack.push(op, *fStatus);
4336    fSetStack.push(new UnicodeSet(), *fStatus);
4337}
4338
4339U_NAMESPACE_END
4340#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4341
4342