1/*
2**************************************************************************
3*   Copyright (C) 2002-2011 International Business Machines Corporation  *
4*   and others. All rights reserved.                                     *
5**************************************************************************
6*/
7//
8//  file:  rematch.cpp
9//
10//         Contains the implementation of class RegexMatcher,
11//         which is one of the main API classes for the ICU regular expression package.
12//
13
14#include "unicode/utypes.h"
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/regex.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/ustring.h"
21#include "unicode/rbbi.h"
22#include "uassert.h"
23#include "cmemory.h"
24#include "uvector.h"
25#include "uvectr32.h"
26#include "uvectr64.h"
27#include "regeximp.h"
28#include "regexst.h"
29#include "regextxt.h"
30#include "ucase.h"
31
32// #include <malloc.h>        // Needed for heapcheck testing
33
34
35// Find progress callback
36// ----------------------
37// Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
38//
39#define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
40    (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
41
42
43// Smart Backtracking
44// ------------------
45// When a failure would go back to a LOOP_C instruction,
46// strings, characters, and setrefs scan backwards for a valid start
47// character themselves, pop the stack, and save state, emulating the
48// LOOP_C's effect but assured that the next character of input is a
49// possible matching character.
50//
51// Good idea in theory; unfortunately it only helps out a few specific
52// cases and slows the engine down a little in the rest.
53
54//#define REGEX_SMART_BACKTRACKING 1
55
56U_NAMESPACE_BEGIN
57
58// Default limit for the size of the back track stack, to avoid system
59//    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
60// This value puts ICU's limits higher than most other regexp implementations,
61//    which use recursion rather than the heap, and take more storage per
62//    backtrack point.
63//
64static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66// Time limit counter constant.
67//   Time limits for expression evaluation are in terms of quanta of work by
68//   the engine, each of which is 10,000 state saves.
69//   This constant determines that state saves per tick number.
70static const int32_t TIMER_INITIAL_VALUE = 10000;
71
72//-----------------------------------------------------------------------------
73//
74//   Constructor and Destructor
75//
76//-----------------------------------------------------------------------------
77RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
78    fDeferredStatus = U_ZERO_ERROR;
79    init(fDeferredStatus);
80    if (U_FAILURE(fDeferredStatus)) {
81        return;
82    }
83    if (pat==NULL) {
84        fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85        return;
86    }
87    fPattern = pat;
88    init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89}
90
91
92
93RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94                           uint32_t flags, UErrorCode &status) {
95    init(status);
96    if (U_FAILURE(status)) {
97        return;
98    }
99    UParseError    pe;
100    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
101    fPattern           = fPatternOwned;
102
103    UText inputText = UTEXT_INITIALIZER;
104    utext_openConstUnicodeString(&inputText, &input, &status);
105    init2(&inputText, status);
106    utext_close(&inputText);
107
108    fInputUniStrMaybeMutable = TRUE;
109}
110
111
112RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113                           uint32_t flags, UErrorCode &status) {
114    init(status);
115    if (U_FAILURE(status)) {
116        return;
117    }
118    UParseError    pe;
119    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
120    if (U_FAILURE(status)) {
121        return;
122    }
123
124    fPattern           = fPatternOwned;
125    init2(input, status);
126}
127
128
129RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130                           uint32_t flags, UErrorCode &status) {
131    init(status);
132    if (U_FAILURE(status)) {
133        return;
134    }
135    UParseError    pe;
136    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
137    if (U_FAILURE(status)) {
138        return;
139    }
140    fPattern           = fPatternOwned;
141    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142}
143
144RegexMatcher::RegexMatcher(UText *regexp,
145                           uint32_t flags, UErrorCode &status) {
146    init(status);
147    if (U_FAILURE(status)) {
148        return;
149    }
150    UParseError    pe;
151    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
152        if (U_FAILURE(status)) {
153        return;
154    }
155
156    fPattern           = fPatternOwned;
157    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158}
159
160
161
162
163RegexMatcher::~RegexMatcher() {
164    delete fStack;
165    if (fData != fSmallData) {
166        uprv_free(fData);
167        fData = NULL;
168    }
169    if (fPatternOwned) {
170        delete fPatternOwned;
171        fPatternOwned = NULL;
172        fPattern = NULL;
173    }
174
175    if (fInput) {
176        delete fInput;
177    }
178    if (fInputText) {
179        utext_close(fInputText);
180    }
181    if (fAltInputText) {
182        utext_close(fAltInputText);
183    }
184
185    #if UCONFIG_NO_BREAK_ITERATION==0
186    delete fWordBreakItr;
187    #endif
188}
189
190//
191//   init()   common initialization for use by all constructors.
192//            Initialize all fields, get the object into a consistent state.
193//            This must be done even when the initial status shows an error,
194//            so that the object is initialized sufficiently well for the destructor
195//            to run safely.
196//
197void RegexMatcher::init(UErrorCode &status) {
198    fPattern           = NULL;
199    fPatternOwned      = NULL;
200    fFrameSize         = 0;
201    fRegionStart       = 0;
202    fRegionLimit       = 0;
203    fAnchorStart       = 0;
204    fAnchorLimit       = 0;
205    fLookStart         = 0;
206    fLookLimit         = 0;
207    fActiveStart       = 0;
208    fActiveLimit       = 0;
209    fTransparentBounds = FALSE;
210    fAnchoringBounds   = TRUE;
211    fMatch             = FALSE;
212    fMatchStart        = 0;
213    fMatchEnd          = 0;
214    fLastMatchEnd      = -1;
215    fAppendPosition    = 0;
216    fHitEnd            = FALSE;
217    fRequireEnd        = FALSE;
218    fStack             = NULL;
219    fFrame             = NULL;
220    fTimeLimit         = 0;
221    fTime              = 0;
222    fTickCounter       = 0;
223    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
224    fCallbackFn        = NULL;
225    fCallbackContext   = NULL;
226    fFindProgressCallbackFn      = NULL;
227    fFindProgressCallbackContext = NULL;
228    fTraceDebug        = FALSE;
229    fDeferredStatus    = status;
230    fData              = fSmallData;
231    fWordBreakItr      = NULL;
232
233    fStack             = new UVector64(status);
234    fInputText         = NULL;
235    fAltInputText      = NULL;
236    fInput             = NULL;
237    fInputLength       = 0;
238    fInputUniStrMaybeMutable = FALSE;
239
240    if (U_FAILURE(status)) {
241        fDeferredStatus = status;
242    }
243}
244
245//
246//  init2()   Common initialization for use by RegexMatcher constructors, part 2.
247//            This handles the common setup to be done after the Pattern is available.
248//
249void RegexMatcher::init2(UText *input, UErrorCode &status) {
250    if (U_FAILURE(status)) {
251        fDeferredStatus = status;
252        return;
253    }
254
255    if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256        fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257        if (fData == NULL) {
258            status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259            return;
260        }
261    }
262
263    reset(input);
264    setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
265    if (U_FAILURE(status)) {
266        fDeferredStatus = status;
267        return;
268    }
269}
270
271
272static const UChar BACKSLASH  = 0x5c;
273static const UChar DOLLARSIGN = 0x24;
274//--------------------------------------------------------------------------------
275//
276//    appendReplacement
277//
278//--------------------------------------------------------------------------------
279RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280                                              const UnicodeString &replacement,
281                                              UErrorCode &status) {
282    UText replacementText = UTEXT_INITIALIZER;
283
284    utext_openConstUnicodeString(&replacementText, &replacement, &status);
285    if (U_SUCCESS(status)) {
286        UText resultText = UTEXT_INITIALIZER;
287        utext_openUnicodeString(&resultText, &dest, &status);
288
289        if (U_SUCCESS(status)) {
290            appendReplacement(&resultText, &replacementText, status);
291            utext_close(&resultText);
292        }
293        utext_close(&replacementText);
294    }
295
296    return *this;
297}
298
299//
300//    appendReplacement, UText mode
301//
302RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303                                              UText *replacement,
304                                              UErrorCode &status) {
305    if (U_FAILURE(status)) {
306        return *this;
307    }
308    if (U_FAILURE(fDeferredStatus)) {
309        status = fDeferredStatus;
310        return *this;
311    }
312    if (fMatch == FALSE) {
313        status = U_REGEX_INVALID_STATE;
314        return *this;
315    }
316
317    // Copy input string from the end of previous match to start of current match
318    int64_t  destLen = utext_nativeLength(dest);
319    if (fMatchStart > fAppendPosition) {
320        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322                                     (int32_t)(fMatchStart-fAppendPosition), &status);
323        } else {
324            int32_t len16;
325            if (UTEXT_USES_U16(fInputText)) {
326                len16 = (int32_t)(fMatchStart-fAppendPosition);
327            } else {
328                UErrorCode lengthStatus = U_ZERO_ERROR;
329                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330            }
331            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332            if (inputChars == NULL) {
333                status = U_MEMORY_ALLOCATION_ERROR;
334                return *this;
335            }
336            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338            uprv_free(inputChars);
339        }
340    }
341    fAppendPosition = fMatchEnd;
342
343
344    // scan the replacement text, looking for substitutions ($n) and \escapes.
345    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
346    //         move entire ranges not containing substitutions.
347    UTEXT_SETNATIVEINDEX(replacement, 0);
348    UChar32 c = UTEXT_NEXT32(replacement);
349    while (c != U_SENTINEL) {
350        if (c == BACKSLASH) {
351            // Backslash Escape.  Copy the following char out without further checks.
352            //                    Note:  Surrogate pairs don't need any special handling
353            //                           The second half wont be a '$' or a '\', and
354            //                           will move to the dest normally on the next
355            //                           loop iteration.
356            c = UTEXT_CURRENT32(replacement);
357            if (c == U_SENTINEL) {
358                break;
359            }
360
361            if (c==0x55/*U*/ || c==0x75/*u*/) {
362                // We have a \udddd or \Udddddddd escape sequence.
363                int32_t offset = 0;
364                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
365                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
366                if (escapedChar != (UChar32)0xFFFFFFFF) {
367                    if (U_IS_BMP(escapedChar)) {
368                        UChar c16 = (UChar)escapedChar;
369                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
370                    } else {
371                        UChar surrogate[2];
372                        surrogate[0] = U16_LEAD(escapedChar);
373                        surrogate[1] = U16_TRAIL(escapedChar);
374                        if (U_SUCCESS(status)) {
375                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
376                        }
377                    }
378                    // TODO:  Report errors for mal-formed \u escapes?
379                    //        As this is, the original sequence is output, which may be OK.
380                    if (context.lastOffset == offset) {
381                        (void)UTEXT_PREVIOUS32(replacement);
382                    } else if (context.lastOffset != offset-1) {
383                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
384                    }
385                }
386            } else {
387                (void)UTEXT_NEXT32(replacement);
388                // Plain backslash escape.  Just put out the escaped character.
389                if (U_IS_BMP(c)) {
390                    UChar c16 = (UChar)c;
391                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
392                } else {
393                    UChar surrogate[2];
394                    surrogate[0] = U16_LEAD(c);
395                    surrogate[1] = U16_TRAIL(c);
396                    if (U_SUCCESS(status)) {
397                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398                    }
399                }
400            }
401        } else if (c != DOLLARSIGN) {
402            // Normal char, not a $.  Copy it out without further checks.
403            if (U_IS_BMP(c)) {
404                UChar c16 = (UChar)c;
405                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
406            } else {
407                UChar surrogate[2];
408                surrogate[0] = U16_LEAD(c);
409                surrogate[1] = U16_TRAIL(c);
410                if (U_SUCCESS(status)) {
411                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
412                }
413            }
414        } else {
415            // We've got a $.  Pick up a capture group number if one follows.
416            // Consume at most the number of digits necessary for the largest capture
417            // number that is valid for this pattern.
418
419            int32_t numDigits = 0;
420            int32_t groupNum  = 0;
421            UChar32 digitC;
422            for (;;) {
423                digitC = UTEXT_CURRENT32(replacement);
424                if (digitC == U_SENTINEL) {
425                    break;
426                }
427                if (u_isdigit(digitC) == FALSE) {
428                    break;
429                }
430                (void)UTEXT_NEXT32(replacement);
431                groupNum=groupNum*10 + u_charDigitValue(digitC);
432                numDigits++;
433                if (numDigits >= fPattern->fMaxCaptureDigits) {
434                    break;
435                }
436            }
437
438
439            if (numDigits == 0) {
440                // The $ didn't introduce a group number at all.
441                // Treat it as just part of the substitution text.
442                UChar c16 = DOLLARSIGN;
443                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
444            } else {
445                // Finally, append the capture group data to the destination.
446                destLen += appendGroup(groupNum, dest, status);
447                if (U_FAILURE(status)) {
448                    // Can fail if group number is out of range.
449                    break;
450                }
451            }
452        }
453
454        if (U_FAILURE(status)) {
455            break;
456        } else {
457            c = UTEXT_NEXT32(replacement);
458        }
459    }
460
461    return *this;
462}
463
464
465
466//--------------------------------------------------------------------------------
467//
468//    appendTail     Intended to be used in conjunction with appendReplacement()
469//                   To the destination string, append everything following
470//                   the last match position from the input string.
471//
472//                   Note:  Match ranges do not affect appendTail or appendReplacement
473//
474//--------------------------------------------------------------------------------
475UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
476    UErrorCode status = U_ZERO_ERROR;
477    UText resultText = UTEXT_INITIALIZER;
478    utext_openUnicodeString(&resultText, &dest, &status);
479
480    if (U_SUCCESS(status)) {
481        appendTail(&resultText, status);
482        utext_close(&resultText);
483    }
484
485    return dest;
486}
487
488//
489//   appendTail, UText mode
490//
491UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
492    UBool bailOut = FALSE;
493    if (U_FAILURE(status)) {
494        bailOut = TRUE;
495    }
496    if (U_FAILURE(fDeferredStatus)) {
497        status = fDeferredStatus;
498        bailOut = TRUE;
499    }
500
501    if (bailOut) {
502        //  dest must not be NULL
503        if (dest) {
504            utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
505            return dest;
506        }
507    }
508
509    if (fInputLength > fAppendPosition) {
510        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
511            int64_t destLen = utext_nativeLength(dest);
512            utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
513                          (int32_t)(fInputLength-fAppendPosition), &status);
514        } else {
515            int32_t len16;
516            if (UTEXT_USES_U16(fInputText)) {
517                len16 = (int32_t)(fInputLength-fAppendPosition);
518            } else {
519                len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
520                status = U_ZERO_ERROR; // buffer overflow
521            }
522
523            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
524            if (inputChars == NULL) {
525                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
526            } else {
527                utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
528                int64_t destLen = utext_nativeLength(dest);
529                utext_replace(dest, destLen, destLen, inputChars, len16, &status);
530                uprv_free(inputChars);
531            }
532        }
533    }
534    return dest;
535}
536
537
538
539//--------------------------------------------------------------------------------
540//
541//   end
542//
543//--------------------------------------------------------------------------------
544int32_t RegexMatcher::end(UErrorCode &err) const {
545    return end(0, err);
546}
547
548int64_t RegexMatcher::end64(UErrorCode &err) const {
549    return end64(0, err);
550}
551
552int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
553    if (U_FAILURE(err)) {
554        return -1;
555    }
556    if (fMatch == FALSE) {
557        err = U_REGEX_INVALID_STATE;
558        return -1;
559    }
560    if (group < 0 || group > fPattern->fGroupMap->size()) {
561        err = U_INDEX_OUTOFBOUNDS_ERROR;
562        return -1;
563    }
564    int64_t e = -1;
565    if (group == 0) {
566        e = fMatchEnd;
567    } else {
568        // Get the position within the stack frame of the variables for
569        //    this capture group.
570        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
571        U_ASSERT(groupOffset < fPattern->fFrameSize);
572        U_ASSERT(groupOffset >= 0);
573        e = fFrame->fExtra[groupOffset + 1];
574    }
575
576        return e;
577}
578
579int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
580    return (int32_t)end64(group, err);
581}
582
583
584//--------------------------------------------------------------------------------
585//
586//   find()
587//
588//--------------------------------------------------------------------------------
589UBool RegexMatcher::find() {
590    // Start at the position of the last match end.  (Will be zero if the
591    //   matcher has been reset.)
592    //
593    if (U_FAILURE(fDeferredStatus)) {
594        return FALSE;
595    }
596
597    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
598        return findUsingChunk();
599    }
600
601    int64_t startPos = fMatchEnd;
602    if (startPos==0) {
603        startPos = fActiveStart;
604    }
605
606    if (fMatch) {
607        // Save the position of any previous successful match.
608        fLastMatchEnd = fMatchEnd;
609
610        if (fMatchStart == fMatchEnd) {
611            // Previous match had zero length.  Move start position up one position
612            //  to avoid sending find() into a loop on zero-length matches.
613            if (startPos >= fActiveLimit) {
614                fMatch = FALSE;
615                fHitEnd = TRUE;
616                return FALSE;
617            }
618            UTEXT_SETNATIVEINDEX(fInputText, startPos);
619            (void)UTEXT_NEXT32(fInputText);
620            startPos = UTEXT_GETNATIVEINDEX(fInputText);
621        }
622    } else {
623        if (fLastMatchEnd >= 0) {
624            // A previous find() failed to match.  Don't try again.
625            //   (without this test, a pattern with a zero-length match
626            //    could match again at the end of an input string.)
627            fHitEnd = TRUE;
628            return FALSE;
629        }
630    }
631
632
633    // Compute the position in the input string beyond which a match can not begin, because
634    //   the minimum length match would extend past the end of the input.
635    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
636    //          Be aware of possible overflows if making changes here.
637    int64_t testStartLimit;
638    if (UTEXT_USES_U16(fInputText)) {
639        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
640        if (startPos > testStartLimit) {
641            fMatch = FALSE;
642            fHitEnd = TRUE;
643            return FALSE;
644        }
645    } else {
646        // For now, let the matcher discover that it can't match on its own
647        // We don't know how long the match len is in native characters
648        testStartLimit = fActiveLimit;
649    }
650
651    UChar32  c;
652    U_ASSERT(startPos >= 0);
653
654    switch (fPattern->fStartType) {
655    case START_NO_INFO:
656        // No optimization was found.
657        //  Try a match at each input position.
658        for (;;) {
659            MatchAt(startPos, FALSE, fDeferredStatus);
660            if (U_FAILURE(fDeferredStatus)) {
661                return FALSE;
662            }
663            if (fMatch) {
664                return TRUE;
665            }
666            if (startPos >= testStartLimit) {
667                fHitEnd = TRUE;
668                return FALSE;
669            }
670            UTEXT_SETNATIVEINDEX(fInputText, startPos);
671            (void)UTEXT_NEXT32(fInputText);
672            startPos = UTEXT_GETNATIVEINDEX(fInputText);
673            // Note that it's perfectly OK for a pattern to have a zero-length
674            //   match at the end of a string, so we must make sure that the loop
675            //   runs with startPos == testStartLimit the last time through.
676            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
677                return FALSE;
678        }
679        U_ASSERT(FALSE);
680
681    case START_START:
682        // Matches are only possible at the start of the input string
683        //   (pattern begins with ^ or \A)
684        if (startPos > fActiveStart) {
685            fMatch = FALSE;
686            return FALSE;
687        }
688        MatchAt(startPos, FALSE, fDeferredStatus);
689        if (U_FAILURE(fDeferredStatus)) {
690            return FALSE;
691        }
692        return fMatch;
693
694
695    case START_SET:
696        {
697            // Match may start on any char from a pre-computed set.
698            U_ASSERT(fPattern->fMinMatchLen > 0);
699            int64_t pos;
700            UTEXT_SETNATIVEINDEX(fInputText, startPos);
701            for (;;) {
702                c = UTEXT_NEXT32(fInputText);
703                pos = UTEXT_GETNATIVEINDEX(fInputText);
704                // c will be -1 (U_SENTINEL) at end of text, in which case we
705                // skip this next block (so we don't have a negative array index)
706                // and handle end of text in the following block.
707                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
708                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
709                    MatchAt(startPos, FALSE, fDeferredStatus);
710                    if (U_FAILURE(fDeferredStatus)) {
711                        return FALSE;
712                    }
713                    if (fMatch) {
714                        return TRUE;
715                    }
716                    UTEXT_SETNATIVEINDEX(fInputText, pos);
717                }
718                if (startPos >= testStartLimit) {
719                    fMatch = FALSE;
720                    fHitEnd = TRUE;
721                    return FALSE;
722                }
723                startPos = pos;
724	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
725                    return FALSE;
726            }
727        }
728        U_ASSERT(FALSE);
729
730    case START_STRING:
731    case START_CHAR:
732        {
733            // Match starts on exactly one char.
734            U_ASSERT(fPattern->fMinMatchLen > 0);
735            UChar32 theChar = fPattern->fInitialChar;
736            int64_t pos;
737            UTEXT_SETNATIVEINDEX(fInputText, startPos);
738            for (;;) {
739                c = UTEXT_NEXT32(fInputText);
740                pos = UTEXT_GETNATIVEINDEX(fInputText);
741                if (c == theChar) {
742                    MatchAt(startPos, FALSE, fDeferredStatus);
743                    if (U_FAILURE(fDeferredStatus)) {
744                        return FALSE;
745                    }
746                    if (fMatch) {
747                        return TRUE;
748                    }
749                    UTEXT_SETNATIVEINDEX(fInputText, pos);
750                }
751                if (startPos >= testStartLimit) {
752                    fMatch = FALSE;
753                    fHitEnd = TRUE;
754                    return FALSE;
755                }
756                startPos = pos;
757	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
758                    return FALSE;
759           }
760        }
761        U_ASSERT(FALSE);
762
763    case START_LINE:
764        {
765            UChar32  c;
766            if (startPos == fAnchorStart) {
767                MatchAt(startPos, FALSE, fDeferredStatus);
768                if (U_FAILURE(fDeferredStatus)) {
769                    return FALSE;
770                }
771                if (fMatch) {
772                    return TRUE;
773                }
774                UTEXT_SETNATIVEINDEX(fInputText, startPos);
775                c = UTEXT_NEXT32(fInputText);
776                startPos = UTEXT_GETNATIVEINDEX(fInputText);
777            } else {
778                UTEXT_SETNATIVEINDEX(fInputText, startPos);
779                c = UTEXT_PREVIOUS32(fInputText);
780                UTEXT_SETNATIVEINDEX(fInputText, startPos);
781            }
782
783            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
784                for (;;) {
785                    if (c == 0x0a) {
786                            MatchAt(startPos, FALSE, fDeferredStatus);
787                            if (U_FAILURE(fDeferredStatus)) {
788                                return FALSE;
789                            }
790                            if (fMatch) {
791                                return TRUE;
792                            }
793                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
794                    }
795                    if (startPos >= testStartLimit) {
796                        fMatch = FALSE;
797                        fHitEnd = TRUE;
798                        return FALSE;
799                    }
800                    c = UTEXT_NEXT32(fInputText);
801                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
802                    // Note that it's perfectly OK for a pattern to have a zero-length
803                    //   match at the end of a string, so we must make sure that the loop
804                    //   runs with startPos == testStartLimit the last time through.
805		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
806                        return FALSE;
807                }
808            } else {
809                for (;;) {
810                    if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
811                        ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
812                            if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
813                                (void)UTEXT_NEXT32(fInputText);
814                                startPos = UTEXT_GETNATIVEINDEX(fInputText);
815                            }
816                            MatchAt(startPos, FALSE, fDeferredStatus);
817                            if (U_FAILURE(fDeferredStatus)) {
818                                return FALSE;
819                            }
820                            if (fMatch) {
821                                return TRUE;
822                            }
823                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
824                    }
825                    if (startPos >= testStartLimit) {
826                        fMatch = FALSE;
827                        fHitEnd = TRUE;
828                        return FALSE;
829                    }
830                    c = UTEXT_NEXT32(fInputText);
831                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
832                    // Note that it's perfectly OK for a pattern to have a zero-length
833                    //   match at the end of a string, so we must make sure that the loop
834                    //   runs with startPos == testStartLimit the last time through.
835		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
836                        return FALSE;
837                }
838            }
839        }
840
841    default:
842        U_ASSERT(FALSE);
843    }
844
845    U_ASSERT(FALSE);
846    return FALSE;
847}
848
849
850
851UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
852    if (U_FAILURE(status)) {
853        return FALSE;
854    }
855    if (U_FAILURE(fDeferredStatus)) {
856        status = fDeferredStatus;
857        return FALSE;
858    }
859    this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
860                                          //        This will reset the region to be the full input length.
861    if (start < 0) {
862        status = U_INDEX_OUTOFBOUNDS_ERROR;
863        return FALSE;
864    }
865
866    int64_t nativeStart = start;
867    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
868        status = U_INDEX_OUTOFBOUNDS_ERROR;
869        return FALSE;
870    }
871    fMatchEnd = nativeStart;
872    return find();
873}
874
875
876//--------------------------------------------------------------------------------
877//
878//   findUsingChunk() -- like find(), but with the advance knowledge that the
879//                       entire string is available in the UText's chunk buffer.
880//
881//--------------------------------------------------------------------------------
882UBool RegexMatcher::findUsingChunk() {
883    // Start at the position of the last match end.  (Will be zero if the
884    //   matcher has been reset.
885    //
886
887    int32_t startPos = (int32_t)fMatchEnd;
888    if (startPos==0) {
889        startPos = (int32_t)fActiveStart;
890    }
891
892    const UChar *inputBuf = fInputText->chunkContents;
893
894    if (fMatch) {
895        // Save the position of any previous successful match.
896        fLastMatchEnd = fMatchEnd;
897
898        if (fMatchStart == fMatchEnd) {
899            // Previous match had zero length.  Move start position up one position
900            //  to avoid sending find() into a loop on zero-length matches.
901            if (startPos >= fActiveLimit) {
902                fMatch = FALSE;
903                fHitEnd = TRUE;
904                return FALSE;
905            }
906            U16_FWD_1(inputBuf, startPos, fInputLength);
907        }
908    } else {
909        if (fLastMatchEnd >= 0) {
910            // A previous find() failed to match.  Don't try again.
911            //   (without this test, a pattern with a zero-length match
912            //    could match again at the end of an input string.)
913            fHitEnd = TRUE;
914            return FALSE;
915        }
916    }
917
918
919    // Compute the position in the input string beyond which a match can not begin, because
920    //   the minimum length match would extend past the end of the input.
921    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
922    //          Be aware of possible overflows if making changes here.
923    int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
924    if (startPos > testLen) {
925        fMatch = FALSE;
926        fHitEnd = TRUE;
927        return FALSE;
928    }
929
930    UChar32  c;
931    U_ASSERT(startPos >= 0);
932
933    switch (fPattern->fStartType) {
934    case START_NO_INFO:
935        // No optimization was found.
936        //  Try a match at each input position.
937        for (;;) {
938            MatchChunkAt(startPos, FALSE, fDeferredStatus);
939            if (U_FAILURE(fDeferredStatus)) {
940                return FALSE;
941            }
942            if (fMatch) {
943                return TRUE;
944            }
945            if (startPos >= testLen) {
946                fHitEnd = TRUE;
947                return FALSE;
948            }
949            U16_FWD_1(inputBuf, startPos, fActiveLimit);
950            // Note that it's perfectly OK for a pattern to have a zero-length
951            //   match at the end of a string, so we must make sure that the loop
952            //   runs with startPos == testLen the last time through.
953            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
954                return FALSE;
955        }
956        U_ASSERT(FALSE);
957
958    case START_START:
959        // Matches are only possible at the start of the input string
960        //   (pattern begins with ^ or \A)
961        if (startPos > fActiveStart) {
962            fMatch = FALSE;
963            return FALSE;
964        }
965        MatchChunkAt(startPos, FALSE, fDeferredStatus);
966        if (U_FAILURE(fDeferredStatus)) {
967            return FALSE;
968        }
969        return fMatch;
970
971
972    case START_SET:
973    {
974        // Match may start on any char from a pre-computed set.
975        U_ASSERT(fPattern->fMinMatchLen > 0);
976        for (;;) {
977            int32_t pos = startPos;
978            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
979            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
980                (c>=256 && fPattern->fInitialChars->contains(c))) {
981                MatchChunkAt(pos, FALSE, fDeferredStatus);
982                if (U_FAILURE(fDeferredStatus)) {
983                    return FALSE;
984                }
985                if (fMatch) {
986                    return TRUE;
987                }
988            }
989            if (pos >= testLen) {
990                fMatch = FALSE;
991                fHitEnd = TRUE;
992                return FALSE;
993            }
994            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
995                return FALSE;
996        }
997    }
998        U_ASSERT(FALSE);
999
1000    case START_STRING:
1001    case START_CHAR:
1002    {
1003        // Match starts on exactly one char.
1004        U_ASSERT(fPattern->fMinMatchLen > 0);
1005        UChar32 theChar = fPattern->fInitialChar;
1006        for (;;) {
1007            int32_t pos = startPos;
1008            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1009            if (c == theChar) {
1010                MatchChunkAt(pos, FALSE, fDeferredStatus);
1011                if (U_FAILURE(fDeferredStatus)) {
1012                    return FALSE;
1013                }
1014                if (fMatch) {
1015                    return TRUE;
1016                }
1017            }
1018            if (pos >= testLen) {
1019                fMatch = FALSE;
1020                fHitEnd = TRUE;
1021                return FALSE;
1022            }
1023            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1024                return FALSE;
1025        }
1026    }
1027        U_ASSERT(FALSE);
1028
1029    case START_LINE:
1030    {
1031        UChar32  c;
1032        if (startPos == fAnchorStart) {
1033            MatchChunkAt(startPos, FALSE, fDeferredStatus);
1034            if (U_FAILURE(fDeferredStatus)) {
1035                return FALSE;
1036            }
1037            if (fMatch) {
1038                return TRUE;
1039            }
1040            U16_FWD_1(inputBuf, startPos, fActiveLimit);
1041        }
1042
1043        if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1044            for (;;) {
1045                c = inputBuf[startPos-1];
1046                if (c == 0x0a) {
1047                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
1048                    if (U_FAILURE(fDeferredStatus)) {
1049                        return FALSE;
1050                    }
1051                    if (fMatch) {
1052                        return TRUE;
1053                    }
1054                }
1055                if (startPos >= testLen) {
1056                    fMatch = FALSE;
1057                    fHitEnd = TRUE;
1058                    return FALSE;
1059                }
1060                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1061                // Note that it's perfectly OK for a pattern to have a zero-length
1062                //   match at the end of a string, so we must make sure that the loop
1063                //   runs with startPos == testLen the last time through.
1064	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1065                    return FALSE;
1066            }
1067        } else {
1068            for (;;) {
1069                c = inputBuf[startPos-1];
1070                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
1071                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1072                    if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1073                        startPos++;
1074                    }
1075                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
1076                    if (U_FAILURE(fDeferredStatus)) {
1077                        return FALSE;
1078                    }
1079                    if (fMatch) {
1080                        return TRUE;
1081                    }
1082                }
1083                if (startPos >= testLen) {
1084                    fMatch = FALSE;
1085                    fHitEnd = TRUE;
1086                    return FALSE;
1087                }
1088                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089                // Note that it's perfectly OK for a pattern to have a zero-length
1090                //   match at the end of a string, so we must make sure that the loop
1091                //   runs with startPos == testLen the last time through.
1092	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1093                    return FALSE;
1094            }
1095        }
1096    }
1097
1098    default:
1099        U_ASSERT(FALSE);
1100    }
1101
1102    U_ASSERT(FALSE);
1103    return FALSE;
1104}
1105
1106
1107
1108//--------------------------------------------------------------------------------
1109//
1110//  group()
1111//
1112//--------------------------------------------------------------------------------
1113UnicodeString RegexMatcher::group(UErrorCode &status) const {
1114    return group(0, status);
1115}
1116
1117//  Return immutable shallow clone
1118UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1119    return group(0, dest, group_len, status);
1120}
1121
1122//  Return immutable shallow clone
1123UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1124    group_len = 0;
1125    UBool bailOut = FALSE;
1126    if (U_FAILURE(status)) {
1127        return dest;
1128    }
1129    if (U_FAILURE(fDeferredStatus)) {
1130        status = fDeferredStatus;
1131        bailOut = TRUE;
1132    }
1133    if (fMatch == FALSE) {
1134        status = U_REGEX_INVALID_STATE;
1135        bailOut = TRUE;
1136    }
1137    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1138        status = U_INDEX_OUTOFBOUNDS_ERROR;
1139        bailOut = TRUE;
1140    }
1141
1142    if (bailOut) {
1143        return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1144    }
1145
1146    int64_t s, e;
1147    if (groupNum == 0) {
1148        s = fMatchStart;
1149        e = fMatchEnd;
1150    } else {
1151        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1152        U_ASSERT(groupOffset < fPattern->fFrameSize);
1153        U_ASSERT(groupOffset >= 0);
1154        s = fFrame->fExtra[groupOffset];
1155        e = fFrame->fExtra[groupOffset+1];
1156    }
1157
1158    if (s < 0) {
1159        // A capture group wasn't part of the match
1160        return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1161    }
1162    U_ASSERT(s <= e);
1163    group_len = e - s;
1164
1165    dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1166    if (dest)
1167        UTEXT_SETNATIVEINDEX(dest, s);
1168    return dest;
1169}
1170
1171UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1172    UnicodeString result;
1173    if (U_FAILURE(status)) {
1174        return result;
1175    }
1176    UText resultText = UTEXT_INITIALIZER;
1177    utext_openUnicodeString(&resultText, &result, &status);
1178    group(groupNum, &resultText, status);
1179    utext_close(&resultText);
1180    return result;
1181}
1182
1183
1184//  Return deep (mutable) clone
1185//		Technology Preview (as an API), but note that the UnicodeString API is implemented
1186//		using this function.
1187UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1188    UBool bailOut = FALSE;
1189    if (U_FAILURE(status)) {
1190        return dest;
1191    }
1192    if (U_FAILURE(fDeferredStatus)) {
1193        status = fDeferredStatus;
1194        bailOut = TRUE;
1195    }
1196
1197    if (fMatch == FALSE) {
1198        status = U_REGEX_INVALID_STATE;
1199        bailOut = TRUE;
1200    }
1201    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202        status = U_INDEX_OUTOFBOUNDS_ERROR;
1203        bailOut = TRUE;
1204    }
1205
1206    if (bailOut) {
1207        if (dest) {
1208            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1209            return dest;
1210        } else {
1211            return utext_openUChars(NULL, NULL, 0, &status);
1212        }
1213    }
1214
1215    int64_t s, e;
1216    if (groupNum == 0) {
1217        s = fMatchStart;
1218        e = fMatchEnd;
1219    } else {
1220        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1221        U_ASSERT(groupOffset < fPattern->fFrameSize);
1222        U_ASSERT(groupOffset >= 0);
1223        s = fFrame->fExtra[groupOffset];
1224        e = fFrame->fExtra[groupOffset+1];
1225    }
1226
1227    if (s < 0) {
1228        // A capture group wasn't part of the match
1229        if (dest) {
1230            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1231            return dest;
1232        } else {
1233            return utext_openUChars(NULL, NULL, 0, &status);
1234        }
1235    }
1236    U_ASSERT(s <= e);
1237
1238    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1239        U_ASSERT(e <= fInputLength);
1240        if (dest) {
1241            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1242        } else {
1243            UText groupText = UTEXT_INITIALIZER;
1244            utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1245            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1246            utext_close(&groupText);
1247        }
1248    } else {
1249        int32_t len16;
1250        if (UTEXT_USES_U16(fInputText)) {
1251            len16 = (int32_t)(e-s);
1252        } else {
1253            UErrorCode lengthStatus = U_ZERO_ERROR;
1254            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1255        }
1256        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1257        if (groupChars == NULL) {
1258            status = U_MEMORY_ALLOCATION_ERROR;
1259            return dest;
1260        }
1261        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1262
1263        if (dest) {
1264            utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1265        } else {
1266            UText groupText = UTEXT_INITIALIZER;
1267            utext_openUChars(&groupText, groupChars, len16, &status);
1268            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1269            utext_close(&groupText);
1270        }
1271
1272        uprv_free(groupChars);
1273    }
1274    return dest;
1275}
1276
1277//--------------------------------------------------------------------------------
1278//
1279//  appendGroup() -- currently internal only, appends a group to a UText rather
1280//                   than replacing its contents
1281//
1282//--------------------------------------------------------------------------------
1283
1284int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1285    if (U_FAILURE(status)) {
1286        return 0;
1287    }
1288    if (U_FAILURE(fDeferredStatus)) {
1289        status = fDeferredStatus;
1290        return 0;
1291    }
1292    int64_t destLen = utext_nativeLength(dest);
1293
1294    if (fMatch == FALSE) {
1295        status = U_REGEX_INVALID_STATE;
1296        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297    }
1298    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1299        status = U_INDEX_OUTOFBOUNDS_ERROR;
1300        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1301    }
1302
1303    int64_t s, e;
1304    if (groupNum == 0) {
1305        s = fMatchStart;
1306        e = fMatchEnd;
1307    } else {
1308        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1309        U_ASSERT(groupOffset < fPattern->fFrameSize);
1310        U_ASSERT(groupOffset >= 0);
1311        s = fFrame->fExtra[groupOffset];
1312        e = fFrame->fExtra[groupOffset+1];
1313    }
1314
1315    if (s < 0) {
1316        // A capture group wasn't part of the match
1317        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1318    }
1319    U_ASSERT(s <= e);
1320
1321    int64_t deltaLen;
1322    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1323        U_ASSERT(e <= fInputLength);
1324        deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1325    } else {
1326        int32_t len16;
1327        if (UTEXT_USES_U16(fInputText)) {
1328            len16 = (int32_t)(e-s);
1329        } else {
1330            UErrorCode lengthStatus = U_ZERO_ERROR;
1331            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1332        }
1333        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1334        if (groupChars == NULL) {
1335            status = U_MEMORY_ALLOCATION_ERROR;
1336            return 0;
1337        }
1338        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1339
1340        deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1341        uprv_free(groupChars);
1342    }
1343    return deltaLen;
1344}
1345
1346
1347
1348//--------------------------------------------------------------------------------
1349//
1350//  groupCount()
1351//
1352//--------------------------------------------------------------------------------
1353int32_t RegexMatcher::groupCount() const {
1354    return fPattern->fGroupMap->size();
1355}
1356
1357
1358
1359//--------------------------------------------------------------------------------
1360//
1361//  hasAnchoringBounds()
1362//
1363//--------------------------------------------------------------------------------
1364UBool RegexMatcher::hasAnchoringBounds() const {
1365    return fAnchoringBounds;
1366}
1367
1368
1369//--------------------------------------------------------------------------------
1370//
1371//  hasTransparentBounds()
1372//
1373//--------------------------------------------------------------------------------
1374UBool RegexMatcher::hasTransparentBounds() const {
1375    return fTransparentBounds;
1376}
1377
1378
1379
1380//--------------------------------------------------------------------------------
1381//
1382//  hitEnd()
1383//
1384//--------------------------------------------------------------------------------
1385UBool RegexMatcher::hitEnd() const {
1386    return fHitEnd;
1387}
1388
1389
1390//--------------------------------------------------------------------------------
1391//
1392//  input()
1393//
1394//--------------------------------------------------------------------------------
1395const UnicodeString &RegexMatcher::input() const {
1396    if (!fInput) {
1397        UErrorCode status = U_ZERO_ERROR;
1398        int32_t len16;
1399        if (UTEXT_USES_U16(fInputText)) {
1400            len16 = (int32_t)fInputLength;
1401        } else {
1402            len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1403            status = U_ZERO_ERROR; // overflow, length status
1404        }
1405        UnicodeString *result = new UnicodeString(len16, 0, 0);
1406
1407        UChar *inputChars = result->getBuffer(len16);
1408        utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1409        result->releaseBuffer(len16);
1410
1411        (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1412    }
1413
1414    return *fInput;
1415}
1416
1417//--------------------------------------------------------------------------------
1418//
1419//  inputText()
1420//
1421//--------------------------------------------------------------------------------
1422UText *RegexMatcher::inputText() const {
1423    return fInputText;
1424}
1425
1426
1427//--------------------------------------------------------------------------------
1428//
1429//  getInput() -- like inputText(), but makes a clone or copies into another UText
1430//
1431//--------------------------------------------------------------------------------
1432UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1433    UBool bailOut = FALSE;
1434    if (U_FAILURE(status)) {
1435        return dest;
1436    }
1437    if (U_FAILURE(fDeferredStatus)) {
1438        status = fDeferredStatus;
1439        bailOut = TRUE;
1440    }
1441
1442    if (bailOut) {
1443        if (dest) {
1444            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1445            return dest;
1446        } else {
1447            return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1448        }
1449    }
1450
1451    if (dest) {
1452        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1453            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1454        } else {
1455            int32_t input16Len;
1456            if (UTEXT_USES_U16(fInputText)) {
1457                input16Len = (int32_t)fInputLength;
1458            } else {
1459                UErrorCode lengthStatus = U_ZERO_ERROR;
1460                input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1461            }
1462            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1463            if (inputChars == NULL) {
1464                return dest;
1465            }
1466
1467            status = U_ZERO_ERROR;
1468            utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1469            status = U_ZERO_ERROR;
1470            utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1471
1472            uprv_free(inputChars);
1473        }
1474        return dest;
1475    } else {
1476        return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1477    }
1478}
1479
1480
1481static UBool compat_SyncMutableUTextContents(UText *ut);
1482static UBool compat_SyncMutableUTextContents(UText *ut) {
1483    UBool retVal = FALSE;
1484
1485    //  In the following test, we're really only interested in whether the UText should switch
1486    //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1487    //  will still point to the correct data.
1488    if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1489        UnicodeString *us=(UnicodeString *)ut->context;
1490
1491        // Update to the latest length.
1492        // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1493        int32_t newLength = us->length();
1494
1495        // Update the chunk description.
1496        // The buffer may have switched between stack- and heap-based.
1497        ut->chunkContents    = us->getBuffer();
1498        ut->chunkLength      = newLength;
1499        ut->chunkNativeLimit = newLength;
1500        ut->nativeIndexingLimit = newLength;
1501        retVal = TRUE;
1502    }
1503
1504    return retVal;
1505}
1506
1507//--------------------------------------------------------------------------------
1508//
1509//  lookingAt()
1510//
1511//--------------------------------------------------------------------------------
1512UBool RegexMatcher::lookingAt(UErrorCode &status) {
1513    if (U_FAILURE(status)) {
1514        return FALSE;
1515    }
1516    if (U_FAILURE(fDeferredStatus)) {
1517        status = fDeferredStatus;
1518        return FALSE;
1519    }
1520
1521    if (fInputUniStrMaybeMutable) {
1522        if (compat_SyncMutableUTextContents(fInputText)) {
1523        fInputLength = utext_nativeLength(fInputText);
1524        reset();
1525        }
1526    }
1527    else {
1528        resetPreserveRegion();
1529    }
1530    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1531        MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1532    } else {
1533        MatchAt(fActiveStart, FALSE, status);
1534    }
1535    return fMatch;
1536}
1537
1538
1539UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1540    if (U_FAILURE(status)) {
1541        return FALSE;
1542    }
1543    if (U_FAILURE(fDeferredStatus)) {
1544        status = fDeferredStatus;
1545        return FALSE;
1546    }
1547    reset();
1548
1549    if (start < 0) {
1550        status = U_INDEX_OUTOFBOUNDS_ERROR;
1551        return FALSE;
1552    }
1553
1554    if (fInputUniStrMaybeMutable) {
1555        if (compat_SyncMutableUTextContents(fInputText)) {
1556        fInputLength = utext_nativeLength(fInputText);
1557        reset();
1558        }
1559    }
1560
1561    int64_t nativeStart;
1562    nativeStart = start;
1563    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1564        status = U_INDEX_OUTOFBOUNDS_ERROR;
1565        return FALSE;
1566    }
1567
1568    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1569        MatchChunkAt((int32_t)nativeStart, FALSE, status);
1570    } else {
1571        MatchAt(nativeStart, FALSE, status);
1572    }
1573    return fMatch;
1574}
1575
1576
1577
1578//--------------------------------------------------------------------------------
1579//
1580//  matches()
1581//
1582//--------------------------------------------------------------------------------
1583UBool RegexMatcher::matches(UErrorCode &status) {
1584    if (U_FAILURE(status)) {
1585        return FALSE;
1586    }
1587    if (U_FAILURE(fDeferredStatus)) {
1588        status = fDeferredStatus;
1589        return FALSE;
1590    }
1591
1592    if (fInputUniStrMaybeMutable) {
1593        if (compat_SyncMutableUTextContents(fInputText)) {
1594        fInputLength = utext_nativeLength(fInputText);
1595        reset();
1596        }
1597    }
1598    else {
1599        resetPreserveRegion();
1600    }
1601
1602    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1603        MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1604    } else {
1605        MatchAt(fActiveStart, TRUE, status);
1606    }
1607    return fMatch;
1608}
1609
1610
1611UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1612    if (U_FAILURE(status)) {
1613        return FALSE;
1614    }
1615    if (U_FAILURE(fDeferredStatus)) {
1616        status = fDeferredStatus;
1617        return FALSE;
1618    }
1619    reset();
1620
1621    if (start < 0) {
1622        status = U_INDEX_OUTOFBOUNDS_ERROR;
1623        return FALSE;
1624    }
1625
1626    if (fInputUniStrMaybeMutable) {
1627        if (compat_SyncMutableUTextContents(fInputText)) {
1628        fInputLength = utext_nativeLength(fInputText);
1629        reset();
1630        }
1631    }
1632
1633    int64_t nativeStart;
1634    nativeStart = start;
1635    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1636        status = U_INDEX_OUTOFBOUNDS_ERROR;
1637        return FALSE;
1638    }
1639
1640    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1641        MatchChunkAt((int32_t)nativeStart, TRUE, status);
1642    } else {
1643        MatchAt(nativeStart, TRUE, status);
1644    }
1645    return fMatch;
1646}
1647
1648
1649
1650//--------------------------------------------------------------------------------
1651//
1652//    pattern
1653//
1654//--------------------------------------------------------------------------------
1655const RegexPattern &RegexMatcher::pattern() const {
1656    return *fPattern;
1657}
1658
1659
1660
1661//--------------------------------------------------------------------------------
1662//
1663//    region
1664//
1665//--------------------------------------------------------------------------------
1666RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1667    if (U_FAILURE(status)) {
1668        return *this;
1669    }
1670
1671    if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1672        status = U_ILLEGAL_ARGUMENT_ERROR;
1673    }
1674
1675    int64_t nativeStart = regionStart;
1676    int64_t nativeLimit = regionLimit;
1677    if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1678      status = U_ILLEGAL_ARGUMENT_ERROR;
1679    }
1680
1681    if (startIndex == -1)
1682      this->reset();
1683    else
1684      resetPreserveRegion();
1685
1686    fRegionStart = nativeStart;
1687    fRegionLimit = nativeLimit;
1688    fActiveStart = nativeStart;
1689    fActiveLimit = nativeLimit;
1690
1691    if (startIndex != -1) {
1692      if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1693          status = U_INDEX_OUTOFBOUNDS_ERROR;
1694      }
1695      fMatchEnd = startIndex;
1696    }
1697
1698    if (!fTransparentBounds) {
1699        fLookStart = nativeStart;
1700        fLookLimit = nativeLimit;
1701    }
1702    if (fAnchoringBounds) {
1703        fAnchorStart = nativeStart;
1704        fAnchorLimit = nativeLimit;
1705    }
1706    return *this;
1707}
1708
1709RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1710  return region(start, limit, -1, status);
1711}
1712
1713//--------------------------------------------------------------------------------
1714//
1715//    regionEnd
1716//
1717//--------------------------------------------------------------------------------
1718int32_t RegexMatcher::regionEnd() const {
1719    return (int32_t)fRegionLimit;
1720}
1721
1722int64_t RegexMatcher::regionEnd64() const {
1723    return fRegionLimit;
1724}
1725
1726//--------------------------------------------------------------------------------
1727//
1728//    regionStart
1729//
1730//--------------------------------------------------------------------------------
1731int32_t RegexMatcher::regionStart() const {
1732    return (int32_t)fRegionStart;
1733}
1734
1735int64_t RegexMatcher::regionStart64() const {
1736    return fRegionStart;
1737}
1738
1739
1740//--------------------------------------------------------------------------------
1741//
1742//    replaceAll
1743//
1744//--------------------------------------------------------------------------------
1745UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1746    UText replacementText = UTEXT_INITIALIZER;
1747    UText resultText = UTEXT_INITIALIZER;
1748    UnicodeString resultString;
1749    if (U_FAILURE(status)) {
1750        return resultString;
1751    }
1752
1753    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754    utext_openUnicodeString(&resultText, &resultString, &status);
1755
1756    replaceAll(&replacementText, &resultText, status);
1757
1758    utext_close(&resultText);
1759    utext_close(&replacementText);
1760
1761    return resultString;
1762}
1763
1764
1765//
1766//    replaceAll, UText mode
1767//
1768UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1769    if (U_FAILURE(status)) {
1770        return dest;
1771    }
1772    if (U_FAILURE(fDeferredStatus)) {
1773        status = fDeferredStatus;
1774        return dest;
1775    }
1776
1777    if (dest == NULL) {
1778        UnicodeString emptyString;
1779        UText empty = UTEXT_INITIALIZER;
1780
1781        utext_openUnicodeString(&empty, &emptyString, &status);
1782        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783        utext_close(&empty);
1784    }
1785
1786    if (U_SUCCESS(status)) {
1787        reset();
1788        while (find()) {
1789            appendReplacement(dest, replacement, status);
1790            if (U_FAILURE(status)) {
1791                break;
1792            }
1793        }
1794        appendTail(dest, status);
1795    }
1796
1797    return dest;
1798}
1799
1800
1801//--------------------------------------------------------------------------------
1802//
1803//    replaceFirst
1804//
1805//--------------------------------------------------------------------------------
1806UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1807    UText replacementText = UTEXT_INITIALIZER;
1808    UText resultText = UTEXT_INITIALIZER;
1809    UnicodeString resultString;
1810
1811    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1812    utext_openUnicodeString(&resultText, &resultString, &status);
1813
1814    replaceFirst(&replacementText, &resultText, status);
1815
1816    utext_close(&resultText);
1817    utext_close(&replacementText);
1818
1819    return resultString;
1820}
1821
1822//
1823//    replaceFirst, UText mode
1824//
1825UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1826    if (U_FAILURE(status)) {
1827        return dest;
1828    }
1829    if (U_FAILURE(fDeferredStatus)) {
1830        status = fDeferredStatus;
1831        return dest;
1832    }
1833
1834    reset();
1835    if (!find()) {
1836        return getInput(dest, status);
1837    }
1838
1839    if (dest == NULL) {
1840        UnicodeString emptyString;
1841        UText empty = UTEXT_INITIALIZER;
1842
1843        utext_openUnicodeString(&empty, &emptyString, &status);
1844        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1845        utext_close(&empty);
1846    }
1847
1848    appendReplacement(dest, replacement, status);
1849    appendTail(dest, status);
1850
1851    return dest;
1852}
1853
1854
1855//--------------------------------------------------------------------------------
1856//
1857//     requireEnd
1858//
1859//--------------------------------------------------------------------------------
1860UBool RegexMatcher::requireEnd() const {
1861    return fRequireEnd;
1862}
1863
1864
1865//--------------------------------------------------------------------------------
1866//
1867//     reset
1868//
1869//--------------------------------------------------------------------------------
1870RegexMatcher &RegexMatcher::reset() {
1871    fRegionStart    = 0;
1872    fRegionLimit    = fInputLength;
1873    fActiveStart    = 0;
1874    fActiveLimit    = fInputLength;
1875    fAnchorStart    = 0;
1876    fAnchorLimit    = fInputLength;
1877    fLookStart      = 0;
1878    fLookLimit      = fInputLength;
1879    resetPreserveRegion();
1880    return *this;
1881}
1882
1883
1884
1885void RegexMatcher::resetPreserveRegion() {
1886    fMatchStart     = 0;
1887    fMatchEnd       = 0;
1888    fLastMatchEnd   = -1;
1889    fAppendPosition = 0;
1890    fMatch          = FALSE;
1891    fHitEnd         = FALSE;
1892    fRequireEnd     = FALSE;
1893    fTime           = 0;
1894    fTickCounter    = TIMER_INITIAL_VALUE;
1895    //resetStack(); // more expensive than it looks...
1896}
1897
1898
1899RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1900    fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1901    if (fPattern->fNeedsAltInput) {
1902        fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1903    }
1904    fInputLength = utext_nativeLength(fInputText);
1905
1906    reset();
1907    delete fInput;
1908    fInput = NULL;
1909
1910    //  Do the following for any UnicodeString.
1911    //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1912    fInputUniStrMaybeMutable = TRUE;
1913
1914    if (fWordBreakItr != NULL) {
1915#if UCONFIG_NO_BREAK_ITERATION==0
1916        UErrorCode status = U_ZERO_ERROR;
1917        fWordBreakItr->setText(fInputText, status);
1918#endif
1919    }
1920    return *this;
1921}
1922
1923
1924RegexMatcher &RegexMatcher::reset(UText *input) {
1925    if (fInputText != input) {
1926        fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1927        if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1928        fInputLength = utext_nativeLength(fInputText);
1929
1930        delete fInput;
1931        fInput = NULL;
1932
1933        if (fWordBreakItr != NULL) {
1934#if UCONFIG_NO_BREAK_ITERATION==0
1935            UErrorCode status = U_ZERO_ERROR;
1936            fWordBreakItr->setText(input, status);
1937#endif
1938        }
1939    }
1940    reset();
1941    fInputUniStrMaybeMutable = FALSE;
1942
1943    return *this;
1944}
1945
1946/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1947    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1948    return *this;
1949}*/
1950
1951RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1952    if (U_FAILURE(status)) {
1953        return *this;
1954    }
1955    reset();       // Reset also resets the region to be the entire string.
1956
1957    if (position < 0 || position > fActiveLimit) {
1958        status = U_INDEX_OUTOFBOUNDS_ERROR;
1959        return *this;
1960    }
1961    fMatchEnd = position;
1962    return *this;
1963}
1964
1965
1966//--------------------------------------------------------------------------------
1967//
1968//    refresh
1969//
1970//--------------------------------------------------------------------------------
1971RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1972    if (U_FAILURE(status)) {
1973        return *this;
1974    }
1975    if (input == NULL) {
1976        status = U_ILLEGAL_ARGUMENT_ERROR;
1977        return *this;
1978    }
1979    if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1980        status = U_ILLEGAL_ARGUMENT_ERROR;
1981        return *this;
1982    }
1983    int64_t  pos = utext_getNativeIndex(fInputText);
1984    //  Shallow read-only clone of the new UText into the existing input UText
1985    fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1986    if (U_FAILURE(status)) {
1987        return *this;
1988    }
1989    utext_setNativeIndex(fInputText, pos);
1990
1991    if (fAltInputText != NULL) {
1992        pos = utext_getNativeIndex(fAltInputText);
1993        fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1994        if (U_FAILURE(status)) {
1995            return *this;
1996        }
1997        utext_setNativeIndex(fAltInputText, pos);
1998    }
1999    return *this;
2000}
2001
2002
2003
2004//--------------------------------------------------------------------------------
2005//
2006//    setTrace
2007//
2008//--------------------------------------------------------------------------------
2009void RegexMatcher::setTrace(UBool state) {
2010    fTraceDebug = state;
2011}
2012
2013
2014
2015//---------------------------------------------------------------------
2016//
2017//   split
2018//
2019//---------------------------------------------------------------------
2020int32_t  RegexMatcher::split(const UnicodeString &input,
2021        UnicodeString    dest[],
2022        int32_t          destCapacity,
2023        UErrorCode      &status)
2024{
2025    UText inputText = UTEXT_INITIALIZER;
2026    utext_openConstUnicodeString(&inputText, &input, &status);
2027    if (U_FAILURE(status)) {
2028        return 0;
2029    }
2030
2031    UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2032    if (destText == NULL) {
2033        status = U_MEMORY_ALLOCATION_ERROR;
2034        return 0;
2035    }
2036    int32_t i;
2037    for (i = 0; i < destCapacity; i++) {
2038        destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2039    }
2040
2041    int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2042
2043    for (i = 0; i < destCapacity; i++) {
2044        utext_close(destText[i]);
2045    }
2046
2047    uprv_free(destText);
2048    utext_close(&inputText);
2049    return fieldCount;
2050}
2051
2052//
2053//   split, UText mode
2054//
2055int32_t  RegexMatcher::split(UText *input,
2056        UText           *dest[],
2057        int32_t          destCapacity,
2058        UErrorCode      &status)
2059{
2060    //
2061    // Check arguements for validity
2062    //
2063    if (U_FAILURE(status)) {
2064        return 0;
2065    };
2066
2067    if (destCapacity < 1) {
2068        status = U_ILLEGAL_ARGUMENT_ERROR;
2069        return 0;
2070    }
2071
2072    //
2073    // Reset for the input text
2074    //
2075    reset(input);
2076    int64_t   nextOutputStringStart = 0;
2077    if (fActiveLimit == 0) {
2078        return 0;
2079    }
2080
2081    //
2082    // Loop through the input text, searching for the delimiter pattern
2083    //
2084    int32_t i;
2085    int32_t numCaptureGroups = fPattern->fGroupMap->size();
2086    for (i=0; ; i++) {
2087        if (i>=destCapacity-1) {
2088            // There is one or zero output string left.
2089            // Fill the last output string with whatever is left from the input, then exit the loop.
2090            //  ( i will be == destCapacity if we filled the output array while processing
2091            //    capture groups of the delimiter expression, in which case we will discard the
2092            //    last capture group saved in favor of the unprocessed remainder of the
2093            //    input string.)
2094            i = destCapacity-1;
2095            if (fActiveLimit > nextOutputStringStart) {
2096                if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2097                    if (dest[i]) {
2098                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2099                                      input->chunkContents+nextOutputStringStart,
2100                                      (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2101                    } else {
2102                        UText remainingText = UTEXT_INITIALIZER;
2103                        utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2104                                         fActiveLimit-nextOutputStringStart, &status);
2105                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2106                        utext_close(&remainingText);
2107                    }
2108                } else {
2109                    UErrorCode lengthStatus = U_ZERO_ERROR;
2110                    int32_t remaining16Length =
2111                        utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2112                    UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2113                    if (remainingChars == NULL) {
2114                        status = U_MEMORY_ALLOCATION_ERROR;
2115                        break;
2116                    }
2117
2118                    utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2119                    if (dest[i]) {
2120                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2121                    } else {
2122                        UText remainingText = UTEXT_INITIALIZER;
2123                        utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2124                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2125                        utext_close(&remainingText);
2126                    }
2127
2128                    uprv_free(remainingChars);
2129                }
2130            }
2131            break;
2132        }
2133        if (find()) {
2134            // We found another delimiter.  Move everything from where we started looking
2135            //  up until the start of the delimiter into the next output string.
2136            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2137                if (dest[i]) {
2138                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2139                                  input->chunkContents+nextOutputStringStart,
2140                                  (int32_t)(fMatchStart-nextOutputStringStart), &status);
2141                } else {
2142                    UText remainingText = UTEXT_INITIALIZER;
2143                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2144                                      fMatchStart-nextOutputStringStart, &status);
2145                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2146                    utext_close(&remainingText);
2147                }
2148            } else {
2149                UErrorCode lengthStatus = U_ZERO_ERROR;
2150                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2151                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2152                if (remainingChars == NULL) {
2153                    status = U_MEMORY_ALLOCATION_ERROR;
2154                    break;
2155                }
2156                utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2157                if (dest[i]) {
2158                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2159                } else {
2160                    UText remainingText = UTEXT_INITIALIZER;
2161                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2162                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2163                    utext_close(&remainingText);
2164                }
2165
2166                uprv_free(remainingChars);
2167            }
2168            nextOutputStringStart = fMatchEnd;
2169
2170            // If the delimiter pattern has capturing parentheses, the captured
2171            //  text goes out into the next n destination strings.
2172            int32_t groupNum;
2173            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2174                if (i >= destCapacity-2) {
2175                    // Never fill the last available output string with capture group text.
2176                    // It will filled with the last field, the remainder of the
2177                    //  unsplit input text.
2178                    break;
2179                }
2180                i++;
2181                dest[i] = group(groupNum, dest[i], status);
2182            }
2183
2184            if (nextOutputStringStart == fActiveLimit) {
2185                // The delimiter was at the end of the string.  We're done, but first
2186                // we output one last empty string, for the empty field following
2187                //   the delimiter at the end of input.
2188                if (i+1 < destCapacity) {
2189                    ++i;
2190                    if (dest[i] == NULL) {
2191                        dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2192                    } else {
2193                        static UChar emptyString[] = {(UChar)0};
2194                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2195                    }
2196                }
2197                break;
2198
2199            }
2200        }
2201        else
2202        {
2203            // We ran off the end of the input while looking for the next delimiter.
2204            // All the remaining text goes into the current output string.
2205            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2206                if (dest[i]) {
2207                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2208                                  input->chunkContents+nextOutputStringStart,
2209                                  (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2210                } else {
2211                    UText remainingText = UTEXT_INITIALIZER;
2212                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2213                                     fActiveLimit-nextOutputStringStart, &status);
2214                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2215                    utext_close(&remainingText);
2216                }
2217            } else {
2218                UErrorCode lengthStatus = U_ZERO_ERROR;
2219                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2220                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2221                if (remainingChars == NULL) {
2222                    status = U_MEMORY_ALLOCATION_ERROR;
2223                    break;
2224                }
2225
2226                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2227                if (dest[i]) {
2228                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2229                } else {
2230                    UText remainingText = UTEXT_INITIALIZER;
2231                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2232                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2233                    utext_close(&remainingText);
2234                }
2235
2236                uprv_free(remainingChars);
2237            }
2238            break;
2239        }
2240        if (U_FAILURE(status)) {
2241            break;
2242        }
2243    }   // end of for loop
2244    return i+1;
2245}
2246
2247
2248//--------------------------------------------------------------------------------
2249//
2250//     start
2251//
2252//--------------------------------------------------------------------------------
2253int32_t RegexMatcher::start(UErrorCode &status) const {
2254    return start(0, status);
2255}
2256
2257int64_t RegexMatcher::start64(UErrorCode &status) const {
2258    return start64(0, status);
2259}
2260
2261//--------------------------------------------------------------------------------
2262//
2263//     start(int32_t group, UErrorCode &status)
2264//
2265//--------------------------------------------------------------------------------
2266
2267int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2268    if (U_FAILURE(status)) {
2269        return -1;
2270    }
2271    if (U_FAILURE(fDeferredStatus)) {
2272        status = fDeferredStatus;
2273        return -1;
2274    }
2275    if (fMatch == FALSE) {
2276        status = U_REGEX_INVALID_STATE;
2277        return -1;
2278    }
2279    if (group < 0 || group > fPattern->fGroupMap->size()) {
2280        status = U_INDEX_OUTOFBOUNDS_ERROR;
2281        return -1;
2282    }
2283    int64_t s;
2284    if (group == 0) {
2285        s = fMatchStart;
2286    } else {
2287        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2288        U_ASSERT(groupOffset < fPattern->fFrameSize);
2289        U_ASSERT(groupOffset >= 0);
2290        s = fFrame->fExtra[groupOffset];
2291    }
2292
2293    return s;
2294}
2295
2296
2297int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2298    return (int32_t)start64(group, status);
2299}
2300
2301//--------------------------------------------------------------------------------
2302//
2303//     useAnchoringBounds
2304//
2305//--------------------------------------------------------------------------------
2306RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2307    fAnchoringBounds = b;
2308    fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2309    fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2310    return *this;
2311}
2312
2313
2314//--------------------------------------------------------------------------------
2315//
2316//     useTransparentBounds
2317//
2318//--------------------------------------------------------------------------------
2319RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2320    fTransparentBounds = b;
2321    fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2322    fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2323    return *this;
2324}
2325
2326//--------------------------------------------------------------------------------
2327//
2328//     setTimeLimit
2329//
2330//--------------------------------------------------------------------------------
2331void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2332    if (U_FAILURE(status)) {
2333        return;
2334    }
2335    if (U_FAILURE(fDeferredStatus)) {
2336        status = fDeferredStatus;
2337        return;
2338    }
2339    if (limit < 0) {
2340        status = U_ILLEGAL_ARGUMENT_ERROR;
2341        return;
2342    }
2343    fTimeLimit = limit;
2344}
2345
2346
2347//--------------------------------------------------------------------------------
2348//
2349//     getTimeLimit
2350//
2351//--------------------------------------------------------------------------------
2352int32_t RegexMatcher::getTimeLimit() const {
2353    return fTimeLimit;
2354}
2355
2356
2357//--------------------------------------------------------------------------------
2358//
2359//     setStackLimit
2360//
2361//--------------------------------------------------------------------------------
2362void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2363    if (U_FAILURE(status)) {
2364        return;
2365    }
2366    if (U_FAILURE(fDeferredStatus)) {
2367        status = fDeferredStatus;
2368        return;
2369    }
2370    if (limit < 0) {
2371        status = U_ILLEGAL_ARGUMENT_ERROR;
2372        return;
2373    }
2374
2375    // Reset the matcher.  This is needed here in case there is a current match
2376    //    whose final stack frame (containing the match results, pointed to by fFrame)
2377    //    would be lost by resizing to a smaller stack size.
2378    reset();
2379
2380    if (limit == 0) {
2381        // Unlimited stack expansion
2382        fStack->setMaxCapacity(0);
2383    } else {
2384        // Change the units of the limit  from bytes to ints, and bump the size up
2385        //   to be big enough to hold at least one stack frame for the pattern,
2386        //   if it isn't there already.
2387        int32_t adjustedLimit = limit / sizeof(int32_t);
2388        if (adjustedLimit < fPattern->fFrameSize) {
2389            adjustedLimit = fPattern->fFrameSize;
2390        }
2391        fStack->setMaxCapacity(adjustedLimit);
2392    }
2393    fStackLimit = limit;
2394}
2395
2396
2397//--------------------------------------------------------------------------------
2398//
2399//     getStackLimit
2400//
2401//--------------------------------------------------------------------------------
2402int32_t RegexMatcher::getStackLimit() const {
2403    return fStackLimit;
2404}
2405
2406
2407//--------------------------------------------------------------------------------
2408//
2409//     setMatchCallback
2410//
2411//--------------------------------------------------------------------------------
2412void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2413                                    const void              *context,
2414                                    UErrorCode              &status) {
2415    if (U_FAILURE(status)) {
2416        return;
2417    }
2418    fCallbackFn = callback;
2419    fCallbackContext = context;
2420}
2421
2422
2423//--------------------------------------------------------------------------------
2424//
2425//     getMatchCallback
2426//
2427//--------------------------------------------------------------------------------
2428void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2429                                  const void              *&context,
2430                                  UErrorCode              &status) {
2431    if (U_FAILURE(status)) {
2432       return;
2433    }
2434    callback = fCallbackFn;
2435    context  = fCallbackContext;
2436}
2437
2438
2439//--------------------------------------------------------------------------------
2440//
2441//     setMatchCallback
2442//
2443//--------------------------------------------------------------------------------
2444void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2445                                                const void                      *context,
2446                                                UErrorCode                      &status) {
2447    if (U_FAILURE(status)) {
2448        return;
2449    }
2450    fFindProgressCallbackFn = callback;
2451    fFindProgressCallbackContext = context;
2452}
2453
2454
2455//--------------------------------------------------------------------------------
2456//
2457//     getMatchCallback
2458//
2459//--------------------------------------------------------------------------------
2460void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2461                                                const void                    *&context,
2462                                                UErrorCode                    &status) {
2463    if (U_FAILURE(status)) {
2464       return;
2465    }
2466    callback = fFindProgressCallbackFn;
2467    context  = fFindProgressCallbackContext;
2468}
2469
2470
2471//================================================================================
2472//
2473//    Code following this point in this file is the internal
2474//    Match Engine Implementation.
2475//
2476//================================================================================
2477
2478
2479//--------------------------------------------------------------------------------
2480//
2481//   resetStack
2482//           Discard any previous contents of the state save stack, and initialize a
2483//           new stack frame to all -1.  The -1s are needed for capture group limits,
2484//           where they indicate that a group has not yet matched anything.
2485//--------------------------------------------------------------------------------
2486REStackFrame *RegexMatcher::resetStack() {
2487    // Discard any previous contents of the state save stack, and initialize a
2488    //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2489    //  where they indicate that a group has not yet matched anything.
2490    fStack->removeAllElements();
2491
2492    REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2493    int32_t i;
2494    for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2495        iFrame->fExtra[i] = -1;
2496    }
2497    return iFrame;
2498}
2499
2500
2501
2502//--------------------------------------------------------------------------------
2503//
2504//   isWordBoundary
2505//                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2506//                     For us,
2507//                       If the current char is a combining mark,
2508//                          \b is FALSE.
2509//                       Else Scan backwards to the first non-combining char.
2510//                            We are at a boundary if the this char and the original chars are
2511//                               opposite in membership in \w set
2512//
2513//          parameters:   pos   - the current position in the input buffer
2514//
2515//              TODO:  double-check edge cases at region boundaries.
2516//
2517//--------------------------------------------------------------------------------
2518UBool RegexMatcher::isWordBoundary(int64_t pos) {
2519    UBool isBoundary = FALSE;
2520    UBool cIsWord    = FALSE;
2521
2522    if (pos >= fLookLimit) {
2523        fHitEnd = TRUE;
2524    } else {
2525        // Determine whether char c at current position is a member of the word set of chars.
2526        // If we're off the end of the string, behave as though we're not at a word char.
2527        UTEXT_SETNATIVEINDEX(fInputText, pos);
2528        UChar32  c = UTEXT_CURRENT32(fInputText);
2529        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2530            // Current char is a combining one.  Not a boundary.
2531            return FALSE;
2532        }
2533        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2534    }
2535
2536    // Back up until we come to a non-combining char, determine whether
2537    //  that char is a word char.
2538    UBool prevCIsWord = FALSE;
2539    for (;;) {
2540        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2541            break;
2542        }
2543        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2544        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2545              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2546            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2547            break;
2548        }
2549    }
2550    isBoundary = cIsWord ^ prevCIsWord;
2551    return isBoundary;
2552}
2553
2554UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2555    UBool isBoundary = FALSE;
2556    UBool cIsWord    = FALSE;
2557
2558    const UChar *inputBuf = fInputText->chunkContents;
2559
2560    if (pos >= fLookLimit) {
2561        fHitEnd = TRUE;
2562    } else {
2563        // Determine whether char c at current position is a member of the word set of chars.
2564        // If we're off the end of the string, behave as though we're not at a word char.
2565        UChar32 c;
2566        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2567        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2568            // Current char is a combining one.  Not a boundary.
2569            return FALSE;
2570        }
2571        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2572    }
2573
2574    // Back up until we come to a non-combining char, determine whether
2575    //  that char is a word char.
2576    UBool prevCIsWord = FALSE;
2577    for (;;) {
2578        if (pos <= fLookStart) {
2579            break;
2580        }
2581        UChar32 prevChar;
2582        U16_PREV(inputBuf, fLookStart, pos, prevChar);
2583        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2584              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2585            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2586            break;
2587        }
2588    }
2589    isBoundary = cIsWord ^ prevCIsWord;
2590    return isBoundary;
2591}
2592
2593//--------------------------------------------------------------------------------
2594//
2595//   isUWordBoundary
2596//
2597//         Test for a word boundary using RBBI word break.
2598//
2599//          parameters:   pos   - the current position in the input buffer
2600//
2601//--------------------------------------------------------------------------------
2602UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2603    UBool       returnVal = FALSE;
2604#if UCONFIG_NO_BREAK_ITERATION==0
2605
2606    // If we haven't yet created a break iterator for this matcher, do it now.
2607    if (fWordBreakItr == NULL) {
2608        fWordBreakItr =
2609            (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2610        if (U_FAILURE(fDeferredStatus)) {
2611            return FALSE;
2612        }
2613        fWordBreakItr->setText(fInputText, fDeferredStatus);
2614    }
2615
2616    if (pos >= fLookLimit) {
2617        fHitEnd = TRUE;
2618        returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2619                            //    words are not boundaries.  All non-word chars stand by themselves,
2620                            //    with word boundaries on both sides.
2621    } else {
2622        if (!UTEXT_USES_U16(fInputText)) {
2623            // !!!: Would like a better way to do this!
2624            UErrorCode status = U_ZERO_ERROR;
2625            pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2626        }
2627        returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2628    }
2629#endif
2630    return   returnVal;
2631}
2632
2633//--------------------------------------------------------------------------------
2634//
2635//   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2636//                     saves. Increment the "time" counter, and call the
2637//                     user callback function if there is one installed.
2638//
2639//                     If the match operation needs to be aborted, either for a time-out
2640//                     or because the user callback asked for it, just set an error status.
2641//                     The engine will pick that up and stop in its outer loop.
2642//
2643//--------------------------------------------------------------------------------
2644void RegexMatcher::IncrementTime(UErrorCode &status) {
2645    fTickCounter = TIMER_INITIAL_VALUE;
2646    fTime++;
2647    if (fCallbackFn != NULL) {
2648        if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2649            status = U_REGEX_STOPPED_BY_CALLER;
2650            return;
2651        }
2652    }
2653    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2654        status = U_REGEX_TIME_OUT;
2655    }
2656}
2657
2658//--------------------------------------------------------------------------------
2659//
2660//   ReportFindProgress     This function is called once for each advance in the target
2661//                          string from the find() function, and calls the user progress callback
2662//                          function if there is one installed.
2663//
2664//                          NOTE:
2665//
2666//                          If the match operation needs to be aborted because the user
2667//                          callback asked for it, just set an error status.
2668//                          The engine will pick that up and stop in its outer loop.
2669//
2670//--------------------------------------------------------------------------------
2671UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2672    if (fFindProgressCallbackFn != NULL) {
2673        if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2674            status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2675            return FALSE;
2676        }
2677    }
2678    return TRUE;
2679}
2680
2681//--------------------------------------------------------------------------------
2682//
2683//   StateSave
2684//       Make a new stack frame, initialized as a copy of the current stack frame.
2685//       Set the pattern index in the original stack frame from the operand value
2686//       in the opcode.  Execution of the engine continues with the state in
2687//       the newly created stack frame
2688//
2689//       Note that reserveBlock() may grow the stack, resulting in the
2690//       whole thing being relocated in memory.
2691//
2692//    Parameters:
2693//       fp           The top frame pointer when called.  At return, a new
2694//                    fame will be present
2695//       savePatIdx   An index into the compiled pattern.  Goes into the original
2696//                    (not new) frame.  If execution ever back-tracks out of the
2697//                    new frame, this will be where we continue from in the pattern.
2698//    Return
2699//                    The new frame pointer.
2700//
2701//--------------------------------------------------------------------------------
2702inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2703    // push storage for a new frame.
2704    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2705    if (newFP == NULL) {
2706        // Failure on attempted stack expansion.
2707        //   Stack function set some other error code, change it to a more
2708        //   specific one for regular expressions.
2709        status = U_REGEX_STACK_OVERFLOW;
2710        // We need to return a writable stack frame, so just return the
2711        //    previous frame.  The match operation will stop quickly
2712        //    because of the error status, after which the frame will never
2713        //    be looked at again.
2714        return fp;
2715    }
2716    fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2717
2718    // New stack frame = copy of old top frame.
2719    int64_t *source = (int64_t *)fp;
2720    int64_t *dest   = newFP;
2721    for (;;) {
2722        *dest++ = *source++;
2723        if (source == newFP) {
2724            break;
2725        }
2726    }
2727
2728    fTickCounter--;
2729    if (fTickCounter <= 0) {
2730       IncrementTime(status);    // Re-initializes fTickCounter
2731    }
2732    fp->fPatIdx = savePatIdx;
2733    return (REStackFrame *)newFP;
2734}
2735
2736
2737//--------------------------------------------------------------------------------
2738//
2739//   MatchAt      This is the actual matching engine.
2740//
2741//                  startIdx:    begin matching a this index.
2742//                  toEnd:       if true, match must extend to end of the input region
2743//
2744//--------------------------------------------------------------------------------
2745void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2746    UBool       isMatch  = FALSE;      // True if the we have a match.
2747
2748    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2749
2750    int32_t     op;                    // Operation from the compiled pattern, split into
2751    int32_t     opType;                //    the opcode
2752    int32_t     opValue;               //    and the operand value.
2753
2754    #ifdef REGEX_RUN_DEBUG
2755    if (fTraceDebug)
2756    {
2757        printf("MatchAt(startIdx=%ld)\n", startIdx);
2758        printf("Original Pattern: ");
2759        UChar32 c = utext_next32From(fPattern->fPattern, 0);
2760        while (c != U_SENTINEL) {
2761            if (c<32 || c>256) {
2762                c = '.';
2763            }
2764            REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2765
2766            c = UTEXT_NEXT32(fPattern->fPattern);
2767        }
2768        printf("\n");
2769        printf("Input String: ");
2770        c = utext_next32From(fInputText, 0);
2771        while (c != U_SENTINEL) {
2772            if (c<32 || c>256) {
2773                c = '.';
2774            }
2775            printf("%c", c);
2776
2777            c = UTEXT_NEXT32(fInputText);
2778        }
2779        printf("\n");
2780        printf("\n");
2781    }
2782    #endif
2783
2784    if (U_FAILURE(status)) {
2785        return;
2786    }
2787
2788    //  Cache frequently referenced items from the compiled pattern
2789    //
2790    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2791
2792    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2793    UVector             *sets          = fPattern->fSets;
2794
2795    fFrameSize = fPattern->fFrameSize;
2796    REStackFrame        *fp            = resetStack();
2797
2798    fp->fPatIdx   = 0;
2799    fp->fInputIdx = startIdx;
2800
2801    // Zero out the pattern's static data
2802    int32_t i;
2803    for (i = 0; i<fPattern->fDataSize; i++) {
2804        fData[i] = 0;
2805    }
2806
2807    //
2808    //  Main loop for interpreting the compiled pattern.
2809    //  One iteration of the loop per pattern operation performed.
2810    //
2811    for (;;) {
2812#if 0
2813        if (_heapchk() != _HEAPOK) {
2814            fprintf(stderr, "Heap Trouble\n");
2815        }
2816#endif
2817
2818        op      = (int32_t)pat[fp->fPatIdx];
2819        opType  = URX_TYPE(op);
2820        opValue = URX_VAL(op);
2821        #ifdef REGEX_RUN_DEBUG
2822        if (fTraceDebug) {
2823            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2824            printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
2825                UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2826            fPattern->dumpOp(fp->fPatIdx);
2827        }
2828        #endif
2829        fp->fPatIdx++;
2830
2831        switch (opType) {
2832
2833
2834        case URX_NOP:
2835            break;
2836
2837
2838        case URX_BACKTRACK:
2839            // Force a backtrack.  In some circumstances, the pattern compiler
2840            //   will notice that the pattern can't possibly match anything, and will
2841            //   emit one of these at that point.
2842            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2843            break;
2844
2845
2846        case URX_ONECHAR:
2847            if (fp->fInputIdx < fActiveLimit) {
2848                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2849                UChar32 c = UTEXT_NEXT32(fInputText);
2850                if (c == opValue) {
2851                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2852                    break;
2853                }
2854            } else {
2855                fHitEnd = TRUE;
2856            }
2857
2858            #ifdef REGEX_SMART_BACKTRACKING
2859            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
2860                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2861                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2862                    UBool success = FALSE;
2863                    UChar32 c = UTEXT_PREVIOUS32(fInputText);
2864                    while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
2865                        if (c == opValue) {
2866                            success = TRUE;
2867                            break;
2868                        } else if (c == U_SENTINEL) {
2869                            break;
2870                        }
2871                        c = UTEXT_PREVIOUS32(fInputText);
2872                    }
2873                    if (success) {
2874                        fHitEnd = FALSE;
2875                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2876                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2877                        if (fp->fInputIdx > backSearchIndex) {
2878                            fp = StateSave(fp, fp->fPatIdx, status);
2879                        }
2880                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
2881                        break;
2882                    }
2883                }
2884            }
2885            #endif
2886
2887            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2888            break;
2889
2890
2891        case URX_STRING:
2892            {
2893                // Test input against a literal string.
2894                // Strings require two slots in the compiled pattern, one for the
2895                //   offset to the string text, and one for the length.
2896                int32_t   stringStartIdx = opValue;
2897                int32_t   stringLen;
2898
2899                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2900                fp->fPatIdx++;
2901                opType    = URX_TYPE(op);
2902                stringLen = URX_VAL(op);
2903                U_ASSERT(opType == URX_STRING_LEN);
2904                U_ASSERT(stringLen >= 2);
2905
2906                const UChar *patternChars = litText+stringStartIdx;
2907                const UChar *patternEnd = patternChars+stringLen;
2908
2909                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2910                UChar32 c;
2911                UBool success = TRUE;
2912
2913                while (patternChars < patternEnd && success) {
2914                    c = UTEXT_NEXT32(fInputText);
2915
2916                    if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
2917                        if (U_IS_BMP(c)) {
2918                            success = (*patternChars == c);
2919                            patternChars += 1;
2920                        } else if (patternChars+1 < patternEnd) {
2921                            success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
2922                            patternChars += 2;
2923                        }
2924                    } else {
2925                        success = FALSE;
2926                        fHitEnd = TRUE;          //   TODO:  See ticket 6074
2927                    }
2928                }
2929
2930                if (success) {
2931                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2932                } else {
2933                    #ifdef REGEX_SMART_BACKTRACKING
2934                    if (fp->fInputIdx > backSearchIndex && fStack->size()) {
2935                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2936                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2937                            // Reset to last start point
2938                            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2939                            patternChars = litText+stringStartIdx;
2940
2941                            // Search backwards for a possible start
2942                            do {
2943                                c = UTEXT_PREVIOUS32(fInputText);
2944                                if (c == U_SENTINEL) {
2945                                    break;
2946                                } else if ((U_IS_BMP(c) && *patternChars == c) ||
2947                                    (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
2948                                    success = TRUE;
2949                                    break;
2950                                }
2951                            } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
2952
2953                            // And try again
2954                            if (success) {
2955                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2956                                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2957                                if (fp->fInputIdx > backSearchIndex) {
2958                                    fp = StateSave(fp, fp->fPatIdx, status);
2959                                }
2960                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
2961                                break;
2962                            }
2963                        }
2964                    }
2965                    #endif
2966                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2967                }
2968            }
2969            break;
2970
2971
2972        case URX_STATE_SAVE:
2973            fp = StateSave(fp, opValue, status);
2974            break;
2975
2976
2977        case URX_END:
2978            // The match loop will exit via this path on a successful match,
2979            //   when we reach the end of the pattern.
2980            if (toEnd && fp->fInputIdx != fActiveLimit) {
2981                // The pattern matched, but not to the end of input.  Try some more.
2982                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2983                break;
2984            }
2985            isMatch = TRUE;
2986            goto  breakFromLoop;
2987
2988        // Start and End Capture stack frame variables are laid out out like this:
2989            //  fp->fExtra[opValue]  - The start of a completed capture group
2990            //             opValue+1 - The end   of a completed capture group
2991            //             opValue+2 - the start of a capture group whose end
2992            //                          has not yet been reached (and might not ever be).
2993        case URX_START_CAPTURE:
2994            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2995            fp->fExtra[opValue+2] = fp->fInputIdx;
2996            break;
2997
2998
2999        case URX_END_CAPTURE:
3000            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
3001            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
3002            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
3003            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
3004            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
3005            break;
3006
3007
3008        case URX_DOLLAR:                   //  $, test for End of line
3009                                           //     or for position before new line at end of input
3010            {
3011                if (fp->fInputIdx >= fAnchorLimit) {
3012                    // We really are at the end of input.  Success.
3013                    fHitEnd = TRUE;
3014                    fRequireEnd = TRUE;
3015                    break;
3016                }
3017
3018                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3019
3020                // If we are positioned just before a new-line that is located at the
3021                //   end of input, succeed.
3022                UChar32 c = UTEXT_NEXT32(fInputText);
3023                if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3024                    if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
3025                        // If not in the middle of a CR/LF sequence
3026                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
3027                            // At new-line at end of input. Success
3028                            fHitEnd = TRUE;
3029                            fRequireEnd = TRUE;
3030
3031                            break;
3032                        }
3033                    }
3034                } else {
3035                    UChar32 nextC = UTEXT_NEXT32(fInputText);
3036                    if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3037                        fHitEnd = TRUE;
3038                        fRequireEnd = TRUE;
3039                        break;                         // At CR/LF at end of input.  Success
3040                    }
3041                }
3042
3043                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3044            }
3045            break;
3046
3047
3048         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
3049            if (fp->fInputIdx >= fAnchorLimit) {
3050                // Off the end of input.  Success.
3051                fHitEnd = TRUE;
3052                fRequireEnd = TRUE;
3053                break;
3054            } else {
3055                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3056                UChar32 c = UTEXT_NEXT32(fInputText);
3057                // Either at the last character of input, or off the end.
3058                if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3059                    fHitEnd = TRUE;
3060                    fRequireEnd = TRUE;
3061                    break;
3062                }
3063            }
3064
3065            // Not at end of input.  Back-track out.
3066            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3067            break;
3068
3069
3070         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3071             {
3072                 if (fp->fInputIdx >= fAnchorLimit) {
3073                     // We really are at the end of input.  Success.
3074                     fHitEnd = TRUE;
3075                     fRequireEnd = TRUE;
3076                     break;
3077                 }
3078                 // If we are positioned just before a new-line, succeed.
3079                 // It makes no difference where the new-line is within the input.
3080                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3081                 UChar32 c = UTEXT_CURRENT32(fInputText);
3082                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3083                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3084                     //  In multi-line mode, hitting a new-line just before the end of input does not
3085                     //   set the hitEnd or requireEnd flags
3086                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3087                        break;
3088                     }
3089                 }
3090                 // not at a new line.  Fail.
3091                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3092             }
3093             break;
3094
3095
3096         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3097             {
3098                 if (fp->fInputIdx >= fAnchorLimit) {
3099                     // We really are at the end of input.  Success.
3100                     fHitEnd = TRUE;
3101                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3102                     break;               //   adding a new-line would not lose the match.
3103                 }
3104                 // If we are not positioned just before a new-line, the test fails; backtrack out.
3105                 // It makes no difference where the new-line is within the input.
3106                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3107                 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3108                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3109                 }
3110             }
3111             break;
3112
3113
3114       case URX_CARET:                    //  ^, test for start of line
3115            if (fp->fInputIdx != fAnchorStart) {
3116                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3117            }
3118            break;
3119
3120
3121       case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3122           {
3123               if (fp->fInputIdx == fAnchorStart) {
3124                   // We are at the start input.  Success.
3125                   break;
3126               }
3127               // Check whether character just before the current pos is a new-line
3128               //   unless we are at the end of input
3129               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3130               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3131               if ((fp->fInputIdx < fAnchorLimit) &&
3132                   ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3133                   //  It's a new-line.  ^ is true.  Success.
3134                   //  TODO:  what should be done with positions between a CR and LF?
3135                   break;
3136               }
3137               // Not at the start of a line.  Fail.
3138               fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139           }
3140           break;
3141
3142
3143       case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3144           {
3145               U_ASSERT(fp->fInputIdx >= fAnchorStart);
3146               if (fp->fInputIdx <= fAnchorStart) {
3147                   // We are at the start input.  Success.
3148                   break;
3149               }
3150               // Check whether character just before the current pos is a new-line
3151               U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3152               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3153               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3154               if (c != 0x0a) {
3155                   // Not at the start of a line.  Back-track out.
3156                   fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3157               }
3158           }
3159           break;
3160
3161        case URX_BACKSLASH_B:          // Test for word boundaries
3162            {
3163                UBool success = isWordBoundary(fp->fInputIdx);
3164                success ^= (opValue != 0);     // flip sense for \B
3165                if (!success) {
3166                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3167                }
3168            }
3169            break;
3170
3171
3172        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3173            {
3174                UBool success = isUWordBoundary(fp->fInputIdx);
3175                success ^= (opValue != 0);     // flip sense for \B
3176                if (!success) {
3177                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3178                }
3179            }
3180            break;
3181
3182
3183        case URX_BACKSLASH_D:            // Test for decimal digit
3184            {
3185                if (fp->fInputIdx >= fActiveLimit) {
3186                    fHitEnd = TRUE;
3187                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3188                    break;
3189                }
3190
3191                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3192
3193                UChar32 c = UTEXT_NEXT32(fInputText);
3194                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3195                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3196                success ^= (opValue != 0);        // flip sense for \D
3197                if (success) {
3198                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3199                } else {
3200                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3201                }
3202            }
3203            break;
3204
3205
3206        case URX_BACKSLASH_G:          // Test for position at end of previous match
3207            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3208                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3209            }
3210            break;
3211
3212
3213        case URX_BACKSLASH_X:
3214            //  Match a Grapheme, as defined by Unicode TR 29.
3215            //  Differs slightly from Perl, which consumes combining marks independently
3216            //    of context.
3217            {
3218
3219                // Fail if at end of input
3220                if (fp->fInputIdx >= fActiveLimit) {
3221                    fHitEnd = TRUE;
3222                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3223                    break;
3224                }
3225
3226                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3227
3228                // Examine (and consume) the current char.
3229                //   Dispatch into a little state machine, based on the char.
3230                UChar32  c;
3231                c = UTEXT_NEXT32(fInputText);
3232                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3233                UnicodeSet **sets = fPattern->fStaticSets;
3234                if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3235                if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3236                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3237                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3238                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3239                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3240                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3241                goto GC_Extend;
3242
3243
3244
3245GC_L:
3246                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3247                c = UTEXT_NEXT32(fInputText);
3248                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3249                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3250                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3251                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3252                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3253                (void)UTEXT_PREVIOUS32(fInputText);
3254                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3255                goto GC_Extend;
3256
3257GC_V:
3258                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3259                c = UTEXT_NEXT32(fInputText);
3260                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3261                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3262                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3263                (void)UTEXT_PREVIOUS32(fInputText);
3264                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3265                goto GC_Extend;
3266
3267GC_T:
3268                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3269                c = UTEXT_NEXT32(fInputText);
3270                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3271                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3272                (void)UTEXT_PREVIOUS32(fInputText);
3273                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3274                goto GC_Extend;
3275
3276GC_Extend:
3277                // Combining characters are consumed here
3278                for (;;) {
3279                    if (fp->fInputIdx >= fActiveLimit) {
3280                        break;
3281                    }
3282                    c = UTEXT_CURRENT32(fInputText);
3283                    if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3284                        break;
3285                    }
3286                    (void)UTEXT_NEXT32(fInputText);
3287                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3288                }
3289                goto GC_Done;
3290
3291GC_Control:
3292                // Most control chars stand alone (don't combine with combining chars),
3293                //   except for that CR/LF sequence is a single grapheme cluster.
3294                if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3295                    c = UTEXT_NEXT32(fInputText);
3296                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3297                }
3298
3299GC_Done:
3300                if (fp->fInputIdx >= fActiveLimit) {
3301                    fHitEnd = TRUE;
3302                }
3303                break;
3304            }
3305
3306
3307
3308
3309        case URX_BACKSLASH_Z:          // Test for end of Input
3310            if (fp->fInputIdx < fAnchorLimit) {
3311                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3312            } else {
3313                fHitEnd = TRUE;
3314                fRequireEnd = TRUE;
3315            }
3316            break;
3317
3318
3319
3320        case URX_STATIC_SETREF:
3321            {
3322                // Test input character against one of the predefined sets
3323                //    (Word Characters, for example)
3324                // The high bit of the op value is a flag for the match polarity.
3325                //    0:   success if input char is in set.
3326                //    1:   success if input char is not in set.
3327                if (fp->fInputIdx >= fActiveLimit) {
3328                    fHitEnd = TRUE;
3329                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3330                    break;
3331                }
3332
3333                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3334                opValue &= ~URX_NEG_SET;
3335                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3336
3337                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3338                UChar32 c = UTEXT_NEXT32(fInputText);
3339                if (c < 256) {
3340                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3341                    if (s8->contains(c)) {
3342                        success = !success;
3343                    }
3344                } else {
3345                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3346                    if (s->contains(c)) {
3347                        success = !success;
3348                    }
3349                }
3350                if (success) {
3351                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3352                } else {
3353                    // the character wasn't in the set.
3354                    #ifdef REGEX_SMART_BACKTRACKING
3355                    if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3356                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3357                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3358                            // Try to find it, backwards
3359                            UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3360                            success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
3361                            do {
3362                                c = UTEXT_PREVIOUS32(fInputText);
3363                                if (c == U_SENTINEL) {
3364                                    break;
3365                                } else if (c < 256) {
3366                                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3367                                    if (s8->contains(c)) {
3368                                        success = !success;
3369                                    }
3370                                } else {
3371                                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3372                                    if (s->contains(c)) {
3373                                        success = !success;
3374                                    }
3375                                }
3376                            } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
3377
3378                            if (success && c != U_SENTINEL) {
3379                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3380                                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3381                                if (fp->fInputIdx > backSearchIndex) {
3382                                    fp = StateSave(fp, fp->fPatIdx, status);
3383                                }
3384                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
3385                                break;
3386                            }
3387                        }
3388                    }
3389                    #endif
3390                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3391                }
3392            }
3393            break;
3394
3395
3396        case URX_STAT_SETREF_N:
3397            {
3398                // Test input character for NOT being a member of  one of
3399                //    the predefined sets (Word Characters, for example)
3400                if (fp->fInputIdx >= fActiveLimit) {
3401                    fHitEnd = TRUE;
3402                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3403                    break;
3404                }
3405
3406                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3407
3408                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3409
3410                UChar32 c = UTEXT_NEXT32(fInputText);
3411                if (c < 256) {
3412                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3413                    if (s8->contains(c) == FALSE) {
3414                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3415                        break;
3416                    }
3417                } else {
3418                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3419                    if (s->contains(c) == FALSE) {
3420                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3421                        break;
3422                    }
3423                }
3424                // the character wasn't in the set.
3425                #ifdef REGEX_SMART_BACKTRACKING
3426                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3427                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3428                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3429                        // Try to find it, backwards
3430                        UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3431                        UBool success = FALSE;
3432                        do {
3433                            c = UTEXT_PREVIOUS32(fInputText);
3434                            if (c == U_SENTINEL) {
3435                                break;
3436                            } else if (c < 256) {
3437                                Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3438                                if (s8->contains(c) == FALSE) {
3439                                    success = TRUE;
3440                                    break;
3441                                }
3442                            } else {
3443                                const UnicodeSet *s = fPattern->fStaticSets[opValue];
3444                                if (s->contains(c) == FALSE) {
3445                                    success = TRUE;
3446                                    break;
3447                                }
3448                            }
3449                        } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3450
3451                        if (success) {
3452                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3453                            fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3454                            if (fp->fInputIdx > backSearchIndex) {
3455                                fp = StateSave(fp, fp->fPatIdx, status);
3456                            }
3457                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
3458                            break;
3459                        }
3460                    }
3461                }
3462                #endif
3463                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3464            }
3465            break;
3466
3467
3468        case URX_SETREF:
3469            if (fp->fInputIdx >= fActiveLimit) {
3470                fHitEnd = TRUE;
3471                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3472                break;
3473            } else {
3474                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3475
3476                // There is input left.  Pick up one char and test it for set membership.
3477                UChar32 c = UTEXT_NEXT32(fInputText);
3478                U_ASSERT(opValue > 0 && opValue < sets->size());
3479                if (c<256) {
3480                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3481                    if (s8->contains(c)) {
3482                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3483                        break;
3484                    }
3485                } else {
3486                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3487                    if (s->contains(c)) {
3488                        // The character is in the set.  A Match.
3489                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3490                        break;
3491                    }
3492                }
3493
3494                // the character wasn't in the set.
3495                #ifdef REGEX_SMART_BACKTRACKING
3496                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3497                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3498                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3499                        // Try to find it, backwards
3500                        UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3501                        UBool success = FALSE;
3502                        do {
3503                            c = UTEXT_PREVIOUS32(fInputText);
3504                            if (c == U_SENTINEL) {
3505                                break;
3506                            } else if (c < 256) {
3507                                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3508                                if (s8->contains(c)) {
3509                                    success = TRUE;
3510                                    break;
3511                                }
3512                            } else {
3513                                UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3514                                if (s->contains(c)) {
3515                                    success = TRUE;
3516                                    break;
3517                                }
3518                            }
3519                        } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3520
3521                        if (success) {
3522                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3523                            fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3524                            if (fp->fInputIdx > backSearchIndex) {
3525                                fp = StateSave(fp, fp->fPatIdx, status);
3526                            }
3527                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
3528                            break;
3529                        }
3530                    }
3531                }
3532                #endif
3533                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3534            }
3535            break;
3536
3537
3538        case URX_DOTANY:
3539            {
3540                // . matches anything, but stops at end-of-line.
3541                if (fp->fInputIdx >= fActiveLimit) {
3542                    // At end of input.  Match failed.  Backtrack out.
3543                    fHitEnd = TRUE;
3544                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3545                    break;
3546                }
3547
3548                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3549
3550                // There is input left.  Advance over one char, unless we've hit end-of-line
3551                UChar32 c = UTEXT_NEXT32(fInputText);
3552                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
3553                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3554                    // End of line in normal mode.   . does not match.
3555                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3556                    break;
3557                }
3558                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3559            }
3560            break;
3561
3562
3563        case URX_DOTANY_ALL:
3564            {
3565                // ., in dot-matches-all (including new lines) mode
3566                if (fp->fInputIdx >= fActiveLimit) {
3567                    // At end of input.  Match failed.  Backtrack out.
3568                    fHitEnd = TRUE;
3569                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3570                    break;
3571                }
3572
3573                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3574
3575                // There is input left.  Advance over one char, except if we are
3576                //   at a cr/lf, advance over both of them.
3577                UChar32 c;
3578                c = UTEXT_NEXT32(fInputText);
3579                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3580                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3581                    // In the case of a CR/LF, we need to advance over both.
3582                    UChar32 nextc = UTEXT_CURRENT32(fInputText);
3583                    if (nextc == 0x0a) {
3584                        (void)UTEXT_NEXT32(fInputText);
3585                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3586                    }
3587                }
3588            }
3589            break;
3590
3591
3592        case URX_DOTANY_UNIX:
3593            {
3594                // '.' operator, matches all, but stops at end-of-line.
3595                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3596                if (fp->fInputIdx >= fActiveLimit) {
3597                    // At end of input.  Match failed.  Backtrack out.
3598                    fHitEnd = TRUE;
3599                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3600                    break;
3601                }
3602
3603                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3604
3605                // There is input left.  Advance over one char, unless we've hit end-of-line
3606                UChar32 c = UTEXT_NEXT32(fInputText);
3607                if (c == 0x0a) {
3608                    // End of line in normal mode.   '.' does not match the \n
3609                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3610                } else {
3611                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3612                }
3613            }
3614            break;
3615
3616
3617        case URX_JMP:
3618            fp->fPatIdx = opValue;
3619            break;
3620
3621        case URX_FAIL:
3622            isMatch = FALSE;
3623            goto breakFromLoop;
3624
3625        case URX_JMP_SAV:
3626            U_ASSERT(opValue < fPattern->fCompiledPat->size());
3627            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3628            fp->fPatIdx = opValue;                         // Then JMP.
3629            break;
3630
3631        case URX_JMP_SAV_X:
3632            // This opcode is used with (x)+, when x can match a zero length string.
3633            // Same as JMP_SAV, except conditional on the match having made forward progress.
3634            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3635            //   data address of the input position at the start of the loop.
3636            {
3637                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3638                int32_t  stoOp = (int32_t)pat[opValue-1];
3639                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3640                int32_t  frameLoc = URX_VAL(stoOp);
3641                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3642                int64_t prevInputIdx = fp->fExtra[frameLoc];
3643                U_ASSERT(prevInputIdx <= fp->fInputIdx);
3644                if (prevInputIdx < fp->fInputIdx) {
3645                    // The match did make progress.  Repeat the loop.
3646                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3647                    fp->fPatIdx = opValue;
3648                    fp->fExtra[frameLoc] = fp->fInputIdx;
3649                }
3650                // If the input position did not advance, we do nothing here,
3651                //   execution will fall out of the loop.
3652            }
3653            break;
3654
3655        case URX_CTR_INIT:
3656            {
3657                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3658                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3659
3660                // Pick up the three extra operands that CTR_INIT has, and
3661                //    skip the pattern location counter past
3662                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3663                fp->fPatIdx += 3;
3664                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3665                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3666                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3667                U_ASSERT(minCount>=0);
3668                U_ASSERT(maxCount>=minCount || maxCount==-1);
3669                U_ASSERT(loopLoc>fp->fPatIdx);
3670
3671                if (minCount == 0) {
3672                    fp = StateSave(fp, loopLoc+1, status);
3673                }
3674                if (maxCount == 0) {
3675                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3676                }
3677            }
3678            break;
3679
3680        case URX_CTR_LOOP:
3681            {
3682                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3683                int32_t initOp = (int32_t)pat[opValue];
3684                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3685                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3686                int32_t minCount  = (int32_t)pat[opValue+2];
3687                int32_t maxCount  = (int32_t)pat[opValue+3];
3688                // Increment the counter.  Note: we DIDN'T worry about counter
3689                //   overflow, since the data comes from UnicodeStrings, which
3690                //   stores its length in an int32_t. Do we have to think about
3691                //   this now that we're using UText? Probably not, since the length
3692                //   in UChar32s is still an int32_t.
3693                (*pCounter)++;
3694                U_ASSERT(*pCounter > 0);
3695                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3696                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
3697                    break;
3698                }
3699                if (*pCounter >= minCount) {
3700                    fp = StateSave(fp, fp->fPatIdx, status);
3701                }
3702                fp->fPatIdx = opValue + 4;    // Loop back.
3703            }
3704            break;
3705
3706        case URX_CTR_INIT_NG:
3707            {
3708                // Initialize a non-greedy loop
3709                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3710                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3711
3712                // Pick up the three extra operands that CTR_INIT has, and
3713                //    skip the pattern location counter past
3714                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3715                fp->fPatIdx += 3;
3716                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3717                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3718                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3719                U_ASSERT(minCount>=0);
3720                U_ASSERT(maxCount>=minCount || maxCount==-1);
3721                U_ASSERT(loopLoc>fp->fPatIdx);
3722
3723                if (minCount == 0) {
3724                    if (maxCount != 0) {
3725                        fp = StateSave(fp, fp->fPatIdx, status);
3726                    }
3727                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3728                }
3729            }
3730            break;
3731
3732        case URX_CTR_LOOP_NG:
3733            {
3734                // Non-greedy {min, max} loops
3735                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3736                int32_t initOp = (int32_t)pat[opValue];
3737                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3738                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3739                int32_t minCount  = (int32_t)pat[opValue+2];
3740                int32_t maxCount  = (int32_t)pat[opValue+3];
3741                // Increment the counter.  Note: we DIDN'T worry about counter
3742                //   overflow, since the data comes from UnicodeStrings, which
3743                //   stores its length in an int32_t. Do we have to think about
3744                //   this now that we're using UText? Probably not, since the length
3745                //   in UChar32s is still an int32_t.
3746                (*pCounter)++;
3747                U_ASSERT(*pCounter > 0);
3748
3749                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3750                    // The loop has matched the maximum permitted number of times.
3751                    //   Break out of here with no action.  Matching will
3752                    //   continue with the following pattern.
3753                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
3754                    break;
3755                }
3756
3757                if (*pCounter < minCount) {
3758                    // We haven't met the minimum number of matches yet.
3759                    //   Loop back for another one.
3760                    fp->fPatIdx = opValue + 4;    // Loop back.
3761                } else {
3762                    // We do have the minimum number of matches.
3763                    //   Fall into the following pattern, but first do
3764                    //   a state save to the top of the loop, so that a failure
3765                    //   in the following pattern will try another iteration of the loop.
3766                    fp = StateSave(fp, opValue + 4, status);
3767                }
3768            }
3769            break;
3770
3771        case URX_STO_SP:
3772            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3773            fData[opValue] = fStack->size();
3774            break;
3775
3776        case URX_LD_SP:
3777            {
3778                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3779                int32_t newStackSize = (int32_t)fData[opValue];
3780                U_ASSERT(newStackSize <= fStack->size());
3781                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3782                if (newFP == (int64_t *)fp) {
3783                    break;
3784                }
3785                int32_t i;
3786                for (i=0; i<fFrameSize; i++) {
3787                    newFP[i] = ((int64_t *)fp)[i];
3788                }
3789                fp = (REStackFrame *)newFP;
3790                fStack->setSize(newStackSize);
3791            }
3792            break;
3793
3794        case URX_BACKREF:
3795        case URX_BACKREF_I:
3796            {
3797                U_ASSERT(opValue < fFrameSize);
3798                int64_t groupStartIdx = fp->fExtra[opValue];
3799                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3800                U_ASSERT(groupStartIdx <= groupEndIdx);
3801                if (groupStartIdx < 0) {
3802                    // This capture group has not participated in the match thus far,
3803                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3804                }
3805
3806                if (groupEndIdx == groupStartIdx) {
3807                    //   The capture group match was of an empty string.
3808                    //   Verified by testing:  Perl matches succeed in this case, so
3809                    //   we do too.
3810                    break;
3811                }
3812
3813                UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3814                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3815
3816                UBool haveMatch = (opType == URX_BACKREF ?
3817                    (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
3818                    (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
3819                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3820
3821                if (fp->fInputIdx > fActiveLimit) {
3822                    fHitEnd = TRUE;
3823                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3824                } else if (!haveMatch) {
3825                    if (fp->fInputIdx == fActiveLimit) {
3826                        fHitEnd = TRUE;
3827                    }
3828                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3829                }
3830            }
3831            break;
3832
3833        case URX_STO_INP_LOC:
3834            {
3835                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3836                fp->fExtra[opValue] = fp->fInputIdx;
3837            }
3838            break;
3839
3840        case URX_JMPX:
3841            {
3842                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3843                fp->fPatIdx += 1;
3844                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3845                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3846                int64_t savedInputIdx = fp->fExtra[dataLoc];
3847                U_ASSERT(savedInputIdx <= fp->fInputIdx);
3848                if (savedInputIdx < fp->fInputIdx) {
3849                    fp->fPatIdx = opValue;                               // JMP
3850                } else {
3851                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3852                }
3853            }
3854            break;
3855
3856        case URX_LA_START:
3857            {
3858                // Entering a lookahead block.
3859                // Save Stack Ptr, Input Pos.
3860                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3861                fData[opValue]   = fStack->size();
3862                fData[opValue+1] = fp->fInputIdx;
3863                fActiveStart     = fLookStart;          // Set the match region change for
3864                fActiveLimit     = fLookLimit;          //   transparent bounds.
3865            }
3866            break;
3867
3868        case URX_LA_END:
3869            {
3870                // Leaving a look-ahead block.
3871                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3872                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3873                int32_t stackSize = fStack->size();
3874                int32_t newStackSize =(int32_t)fData[opValue];
3875                U_ASSERT(stackSize >= newStackSize);
3876                if (stackSize > newStackSize) {
3877                    // Copy the current top frame back to the new (cut back) top frame.
3878                    //   This makes the capture groups from within the look-ahead
3879                    //   expression available.
3880                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3881                    int32_t i;
3882                    for (i=0; i<fFrameSize; i++) {
3883                        newFP[i] = ((int64_t *)fp)[i];
3884                    }
3885                    fp = (REStackFrame *)newFP;
3886                    fStack->setSize(newStackSize);
3887                }
3888                fp->fInputIdx = fData[opValue+1];
3889
3890                // Restore the active region bounds in the input string; they may have
3891                //    been changed because of transparent bounds on a Region.
3892                fActiveStart = fRegionStart;
3893                fActiveLimit = fRegionLimit;
3894            }
3895            break;
3896
3897        case URX_ONECHAR_I:
3898            if (fp->fInputIdx < fActiveLimit) {
3899                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3900
3901                UChar32 c = UTEXT_NEXT32(fInputText);
3902                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3903                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3904                    break;
3905                }
3906            } else {
3907                fHitEnd = TRUE;
3908            }
3909
3910            #ifdef REGEX_SMART_BACKTRACKING
3911            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3912                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3913                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3914                    UBool success = FALSE;
3915                    UChar32 c = UTEXT_PREVIOUS32(fInputText);
3916                    while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
3917                        if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3918                            success = TRUE;
3919                            break;
3920                        } else if (c == U_SENTINEL) {
3921                            break;
3922                        }
3923                        c = UTEXT_PREVIOUS32(fInputText);
3924                    }
3925                    if (success) {
3926                        fHitEnd = FALSE;
3927                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3928                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3929                        if (fp->fInputIdx > backSearchIndex) {
3930                            fp = StateSave(fp, fp->fPatIdx, status);
3931                        }
3932                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
3933                        break;
3934                    }
3935                }
3936            }
3937            #endif
3938
3939            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3940            break;
3941
3942        case URX_STRING_I:
3943            {
3944                // Test input against a literal string.
3945                // Strings require two slots in the compiled pattern, one for the
3946                //   offset to the string text, and one for the length.
3947                const UCaseProps *csp = ucase_getSingleton();
3948                {
3949                    int32_t stringStartIdx, stringLen;
3950                    stringStartIdx = opValue;
3951
3952                    op      = (int32_t)pat[fp->fPatIdx];
3953                    fp->fPatIdx++;
3954                    opType  = URX_TYPE(op);
3955                    opValue = URX_VAL(op);
3956                    U_ASSERT(opType == URX_STRING_LEN);
3957                    stringLen = opValue;
3958
3959                    const UChar *patternChars = litText+stringStartIdx;
3960                    const UChar *patternEnd = patternChars+stringLen;
3961
3962                    const UChar *foldChars = NULL;
3963                    int32_t foldOffset, foldLength;
3964                    UChar32 c;
3965
3966                    foldOffset = foldLength = 0;
3967                    UBool success = TRUE;
3968
3969                    UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3970                    while (patternChars < patternEnd && success) {
3971                        if(foldOffset < foldLength) {
3972                            U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3973                        } else {
3974                            c = UTEXT_NEXT32(fInputText);
3975                            if (c != U_SENTINEL) {
3976                                foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3977                                if(foldLength >= 0) {
3978                                    if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
3979                                        foldOffset = 0;
3980                                        U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3981                                    } else {
3982                                        c = foldLength;
3983                                        foldLength = foldOffset; // to avoid reading chars from the folding buffer
3984                                    }
3985                                }
3986                            }
3987
3988                            fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3989                        }
3990
3991                        success = FALSE;
3992                        if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
3993                            if (U_IS_BMP(c)) {
3994                                success = (*patternChars == c);
3995                                patternChars += 1;
3996                            } else if (patternChars+1 < patternEnd) {
3997                                success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
3998                                patternChars += 2;
3999                            }
4000                        } else {
4001                            fHitEnd = TRUE;          //   TODO:  See ticket 6074
4002                        }
4003                    }
4004
4005                    if (!success) {
4006                        #ifdef REGEX_SMART_BACKTRACKING
4007                        if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4008                            REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4009                            if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4010                                // Reset to last start point
4011                                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4012                                patternChars = litText+stringStartIdx;
4013
4014                                // Search backwards for a possible start
4015                                do {
4016                                    c = UTEXT_PREVIOUS32(fInputText);
4017                                    if (c == U_SENTINEL) {
4018                                        break;
4019                                    } else {
4020                                        foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
4021                                        if(foldLength >= 0) {
4022                                            if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
4023                                                foldOffset = 0;
4024                                                U16_NEXT_UNSAFE(foldChars, foldOffset, c);
4025                                            } else {
4026                                                c = foldLength;
4027                                                foldLength = foldOffset; // to avoid reading chars from the folding buffer
4028                                            }
4029                                        }
4030
4031                                        if ((U_IS_BMP(c) && *patternChars == c) ||
4032                                               (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
4033                                            success = TRUE;
4034                                            break;
4035                                        }
4036                                    }
4037                                } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
4038
4039                                // And try again
4040                                if (success) {
4041                                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4042                                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4043                                    if (fp->fInputIdx > backSearchIndex) {
4044                                        fp = StateSave(fp, fp->fPatIdx, status);
4045                                    }
4046                                    fp->fPatIdx++; // Skip the LOOP_C, we just did that
4047                                    break;
4048                                }
4049                            }
4050                        }
4051                        #endif
4052                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4053                    }
4054                }
4055            }
4056            break;
4057
4058        case URX_LB_START:
4059            {
4060                // Entering a look-behind block.
4061                // Save Stack Ptr, Input Pos.
4062                //   TODO:  implement transparent bounds.  Ticket #6067
4063                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4064                fData[opValue]   = fStack->size();
4065                fData[opValue+1] = fp->fInputIdx;
4066                // Init the variable containing the start index for attempted matches.
4067                fData[opValue+2] = -1;
4068                // Save input string length, then reset to pin any matches to end at
4069                //   the current position.
4070                fData[opValue+3] = fActiveLimit;
4071                fActiveLimit     = fp->fInputIdx;
4072            }
4073            break;
4074
4075
4076        case URX_LB_CONT:
4077            {
4078                // Positive Look-Behind, at top of loop checking for matches of LB expression
4079                //    at all possible input starting positions.
4080
4081                // Fetch the min and max possible match lengths.  They are the operands
4082                //   of this op in the pattern.
4083                int32_t minML = (int32_t)pat[fp->fPatIdx++];
4084                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4085                U_ASSERT(minML <= maxML);
4086                U_ASSERT(minML >= 0);
4087
4088                // Fetch (from data) the last input index where a match was attempted.
4089                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4090                int64_t  *lbStartIdx = &fData[opValue+2];
4091                if (*lbStartIdx < 0) {
4092                    // First time through loop.
4093                    *lbStartIdx = fp->fInputIdx - minML;
4094                } else {
4095                    // 2nd through nth time through the loop.
4096                    // Back up start position for match by one.
4097                    if (*lbStartIdx == 0) {
4098                        (*lbStartIdx)--;
4099                    } else {
4100                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4101                        (void)UTEXT_PREVIOUS32(fInputText);
4102                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4103                    }
4104                }
4105
4106                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4107                    // We have tried all potential match starting points without
4108                    //  getting a match.  Backtrack out, and out of the
4109                    //   Look Behind altogether.
4110                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4111                    int64_t restoreInputLen = fData[opValue+3];
4112                    U_ASSERT(restoreInputLen >= fActiveLimit);
4113                    U_ASSERT(restoreInputLen <= fInputLength);
4114                    fActiveLimit = restoreInputLen;
4115                    break;
4116                }
4117
4118                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4119                //      (successful match will fall off the end of the loop.)
4120                fp = StateSave(fp, fp->fPatIdx-3, status);
4121                fp->fInputIdx = *lbStartIdx;
4122            }
4123            break;
4124
4125        case URX_LB_END:
4126            // End of a look-behind block, after a successful match.
4127            {
4128                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4129                if (fp->fInputIdx != fActiveLimit) {
4130                    //  The look-behind expression matched, but the match did not
4131                    //    extend all the way to the point that we are looking behind from.
4132                    //  FAIL out of here, which will take us back to the LB_CONT, which
4133                    //     will retry the match starting at another position or fail
4134                    //     the look-behind altogether, whichever is appropriate.
4135                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4136                    break;
4137                }
4138
4139                // Look-behind match is good.  Restore the orignal input string length,
4140                //   which had been truncated to pin the end of the lookbehind match to the
4141                //   position being looked-behind.
4142                int64_t originalInputLen = fData[opValue+3];
4143                U_ASSERT(originalInputLen >= fActiveLimit);
4144                U_ASSERT(originalInputLen <= fInputLength);
4145                fActiveLimit = originalInputLen;
4146            }
4147            break;
4148
4149
4150        case URX_LBN_CONT:
4151            {
4152                // Negative Look-Behind, at top of loop checking for matches of LB expression
4153                //    at all possible input starting positions.
4154
4155                // Fetch the extra parameters of this op.
4156                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
4157                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
4158                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4159                        continueLoc = URX_VAL(continueLoc);
4160                U_ASSERT(minML <= maxML);
4161                U_ASSERT(minML >= 0);
4162                U_ASSERT(continueLoc > fp->fPatIdx);
4163
4164                // Fetch (from data) the last input index where a match was attempted.
4165                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4166                int64_t  *lbStartIdx = &fData[opValue+2];
4167                if (*lbStartIdx < 0) {
4168                    // First time through loop.
4169                    *lbStartIdx = fp->fInputIdx - minML;
4170                } else {
4171                    // 2nd through nth time through the loop.
4172                    // Back up start position for match by one.
4173                    if (*lbStartIdx == 0) {
4174                        (*lbStartIdx)--;
4175                    } else {
4176                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4177                        (void)UTEXT_PREVIOUS32(fInputText);
4178                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4179                    }
4180                }
4181
4182                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4183                    // We have tried all potential match starting points without
4184                    //  getting a match, which means that the negative lookbehind as
4185                    //  a whole has succeeded.  Jump forward to the continue location
4186                    int64_t restoreInputLen = fData[opValue+3];
4187                    U_ASSERT(restoreInputLen >= fActiveLimit);
4188                    U_ASSERT(restoreInputLen <= fInputLength);
4189                    fActiveLimit = restoreInputLen;
4190                    fp->fPatIdx = continueLoc;
4191                    break;
4192                }
4193
4194                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4195                //      (successful match will cause a FAIL out of the loop altogether.)
4196                fp = StateSave(fp, fp->fPatIdx-4, status);
4197                fp->fInputIdx = *lbStartIdx;
4198            }
4199            break;
4200
4201        case URX_LBN_END:
4202            // End of a negative look-behind block, after a successful match.
4203            {
4204                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4205                if (fp->fInputIdx != fActiveLimit) {
4206                    //  The look-behind expression matched, but the match did not
4207                    //    extend all the way to the point that we are looking behind from.
4208                    //  FAIL out of here, which will take us back to the LB_CONT, which
4209                    //     will retry the match starting at another position or succeed
4210                    //     the look-behind altogether, whichever is appropriate.
4211                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4212                    break;
4213                }
4214
4215                // Look-behind expression matched, which means look-behind test as
4216                //   a whole Fails
4217
4218                //   Restore the orignal input string length, which had been truncated
4219                //   inorder to pin the end of the lookbehind match
4220                //   to the position being looked-behind.
4221                int64_t originalInputLen = fData[opValue+3];
4222                U_ASSERT(originalInputLen >= fActiveLimit);
4223                U_ASSERT(originalInputLen <= fInputLength);
4224                fActiveLimit = originalInputLen;
4225
4226                // Restore original stack position, discarding any state saved
4227                //   by the successful pattern match.
4228                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4229                int32_t newStackSize = (int32_t)fData[opValue];
4230                U_ASSERT(fStack->size() > newStackSize);
4231                fStack->setSize(newStackSize);
4232
4233                //  FAIL, which will take control back to someplace
4234                //  prior to entering the look-behind test.
4235                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4236            }
4237            break;
4238
4239
4240        case URX_LOOP_SR_I:
4241            // Loop Initialization for the optimized implementation of
4242            //     [some character set]*
4243            //   This op scans through all matching input.
4244            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4245            {
4246                U_ASSERT(opValue > 0 && opValue < sets->size());
4247                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4248                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4249
4250                // Loop through input, until either the input is exhausted or
4251                //   we reach a character that is not a member of the set.
4252                int64_t ix = fp->fInputIdx;
4253                UTEXT_SETNATIVEINDEX(fInputText, ix);
4254                for (;;) {
4255                    if (ix >= fActiveLimit) {
4256                        fHitEnd = TRUE;
4257                        break;
4258                    }
4259                    UChar32 c = UTEXT_NEXT32(fInputText);
4260                    if (c<256) {
4261                        if (s8->contains(c) == FALSE) {
4262                            break;
4263                        }
4264                    } else {
4265                        if (s->contains(c) == FALSE) {
4266                            break;
4267                        }
4268                    }
4269                    ix = UTEXT_GETNATIVEINDEX(fInputText);
4270                }
4271
4272                // If there were no matching characters, skip over the loop altogether.
4273                //   The loop doesn't run at all, a * op always succeeds.
4274                if (ix == fp->fInputIdx) {
4275                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4276                    break;
4277                }
4278
4279                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4280                //   must follow.  It's operand is the stack location
4281                //   that holds the starting input index for the match of this [set]*
4282                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4283                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4284                int32_t stackLoc = URX_VAL(loopcOp);
4285                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4286                fp->fExtra[stackLoc] = fp->fInputIdx;
4287                #ifdef REGEX_SMART_BACKTRACKING
4288                backSearchIndex = fp->fInputIdx;
4289                #endif
4290                fp->fInputIdx = ix;
4291
4292                // Save State to the URX_LOOP_C op that follows this one,
4293                //   so that match failures in the following code will return to there.
4294                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4295                fp = StateSave(fp, fp->fPatIdx, status);
4296                fp->fPatIdx++;
4297            }
4298            break;
4299
4300
4301        case URX_LOOP_DOT_I:
4302            // Loop Initialization for the optimized implementation of .*
4303            //   This op scans through all remaining input.
4304            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4305            {
4306                // Loop through input until the input is exhausted (we reach an end-of-line)
4307                // In DOTALL mode, we can just go straight to the end of the input.
4308                int64_t ix;
4309                if ((opValue & 1) == 1) {
4310                    // Dot-matches-All mode.  Jump straight to the end of the string.
4311                    ix = fActiveLimit;
4312                    fHitEnd = TRUE;
4313                } else {
4314                    // NOT DOT ALL mode.  Line endings do not match '.'
4315                    // Scan forward until a line ending or end of input.
4316                    ix = fp->fInputIdx;
4317                    UTEXT_SETNATIVEINDEX(fInputText, ix);
4318                    for (;;) {
4319                        if (ix >= fActiveLimit) {
4320                            fHitEnd = TRUE;
4321                            break;
4322                        }
4323                        UChar32 c = UTEXT_NEXT32(fInputText);
4324                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4325                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4326                               (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4327                                    (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4328                                //  char is a line ending.  Exit the scanning loop.
4329                                break;
4330                            }
4331                        }
4332                        ix = UTEXT_GETNATIVEINDEX(fInputText);
4333                    }
4334                }
4335
4336                // If there were no matching characters, skip over the loop altogether.
4337                //   The loop doesn't run at all, a * op always succeeds.
4338                if (ix == fp->fInputIdx) {
4339                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4340                    break;
4341                }
4342
4343                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4344                //   must follow.  It's operand is the stack location
4345                //   that holds the starting input index for the match of this .*
4346                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4347                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4348                int32_t stackLoc = URX_VAL(loopcOp);
4349                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4350                fp->fExtra[stackLoc] = fp->fInputIdx;
4351                #ifdef REGEX_SMART_BACKTRACKING
4352                backSearchIndex = fp->fInputIdx;
4353                #endif
4354                fp->fInputIdx = ix;
4355
4356                // Save State to the URX_LOOP_C op that follows this one,
4357                //   so that match failures in the following code will return to there.
4358                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4359                fp = StateSave(fp, fp->fPatIdx, status);
4360                fp->fPatIdx++;
4361            }
4362            break;
4363
4364
4365        case URX_LOOP_C:
4366            {
4367                U_ASSERT(opValue>=0 && opValue<fFrameSize);
4368                backSearchIndex = fp->fExtra[opValue];
4369                U_ASSERT(backSearchIndex <= fp->fInputIdx);
4370                if (backSearchIndex == fp->fInputIdx) {
4371                    // We've backed up the input idx to the point that the loop started.
4372                    // The loop is done.  Leave here without saving state.
4373                    //  Subsequent failures won't come back here.
4374                    break;
4375                }
4376                // Set up for the next iteration of the loop, with input index
4377                //   backed up by one from the last time through,
4378                //   and a state save to this instruction in case the following code fails again.
4379                //   (We're going backwards because this loop emulates stack unwinding, not
4380                //    the initial scan forward.)
4381                U_ASSERT(fp->fInputIdx > 0);
4382                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4383                UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4384                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4385
4386                UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4387                if (prevC == 0x0a &&
4388                    fp->fInputIdx > backSearchIndex &&
4389                    twoPrevC == 0x0d) {
4390                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4391                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4392                        // .*, stepping back over CRLF pair.
4393                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4394                    }
4395                }
4396
4397
4398                fp = StateSave(fp, fp->fPatIdx-1, status);
4399            }
4400            break;
4401
4402
4403
4404        default:
4405            // Trouble.  The compiled pattern contains an entry with an
4406            //           unrecognized type tag.
4407            U_ASSERT(FALSE);
4408        }
4409
4410        if (U_FAILURE(status)) {
4411            isMatch = FALSE;
4412            break;
4413        }
4414    }
4415
4416breakFromLoop:
4417    fMatch = isMatch;
4418    if (isMatch) {
4419        fLastMatchEnd = fMatchEnd;
4420        fMatchStart   = startIdx;
4421        fMatchEnd     = fp->fInputIdx;
4422        if (fTraceDebug) {
4423            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
4424        }
4425    }
4426    else
4427    {
4428        if (fTraceDebug) {
4429            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4430        }
4431    }
4432
4433    fFrame = fp;                // The active stack frame when the engine stopped.
4434                                //   Contains the capture group results that we need to
4435                                //    access later.
4436    return;
4437}
4438
4439
4440//--------------------------------------------------------------------------------
4441//
4442//   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4443//                  assumption that the entire string is available in the UText's
4444//                  chunk buffer. For now, that means we can use int32_t indexes,
4445//                  except for anything that needs to be saved (like group starts
4446//                  and ends).
4447//
4448//                  startIdx:    begin matching a this index.
4449//                  toEnd:       if true, match must extend to end of the input region
4450//
4451//--------------------------------------------------------------------------------
4452void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4453    UBool       isMatch  = FALSE;      // True if the we have a match.
4454
4455    int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4456
4457    int32_t     op;                    // Operation from the compiled pattern, split into
4458    int32_t     opType;                //    the opcode
4459    int32_t     opValue;               //    and the operand value.
4460
4461#ifdef REGEX_RUN_DEBUG
4462    if (fTraceDebug)
4463    {
4464        printf("MatchAt(startIdx=%ld)\n", startIdx);
4465        printf("Original Pattern: ");
4466        UChar32 c = utext_next32From(fPattern->fPattern, 0);
4467        while (c != U_SENTINEL) {
4468            if (c<32 || c>256) {
4469                c = '.';
4470            }
4471            REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4472
4473            c = UTEXT_NEXT32(fPattern->fPattern);
4474        }
4475        printf("\n");
4476        printf("Input String: ");
4477        c = utext_next32From(fInputText, 0);
4478        while (c != U_SENTINEL) {
4479            if (c<32 || c>256) {
4480                c = '.';
4481            }
4482            printf("%c", c);
4483
4484            c = UTEXT_NEXT32(fInputText);
4485        }
4486        printf("\n");
4487        printf("\n");
4488    }
4489#endif
4490
4491    if (U_FAILURE(status)) {
4492        return;
4493    }
4494
4495    //  Cache frequently referenced items from the compiled pattern
4496    //
4497    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4498
4499    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4500    UVector             *sets          = fPattern->fSets;
4501
4502    const UChar         *inputBuf      = fInputText->chunkContents;
4503
4504    fFrameSize = fPattern->fFrameSize;
4505    REStackFrame        *fp            = resetStack();
4506
4507    fp->fPatIdx   = 0;
4508    fp->fInputIdx = startIdx;
4509
4510    // Zero out the pattern's static data
4511    int32_t i;
4512    for (i = 0; i<fPattern->fDataSize; i++) {
4513        fData[i] = 0;
4514    }
4515
4516    //
4517    //  Main loop for interpreting the compiled pattern.
4518    //  One iteration of the loop per pattern operation performed.
4519    //
4520    for (;;) {
4521#if 0
4522        if (_heapchk() != _HEAPOK) {
4523            fprintf(stderr, "Heap Trouble\n");
4524        }
4525#endif
4526
4527        op      = (int32_t)pat[fp->fPatIdx];
4528        opType  = URX_TYPE(op);
4529        opValue = URX_VAL(op);
4530#ifdef REGEX_RUN_DEBUG
4531        if (fTraceDebug) {
4532            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4533            printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
4534                   UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4535            fPattern->dumpOp(fp->fPatIdx);
4536        }
4537#endif
4538        fp->fPatIdx++;
4539
4540        switch (opType) {
4541
4542
4543        case URX_NOP:
4544            break;
4545
4546
4547        case URX_BACKTRACK:
4548            // Force a backtrack.  In some circumstances, the pattern compiler
4549            //   will notice that the pattern can't possibly match anything, and will
4550            //   emit one of these at that point.
4551            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4552            break;
4553
4554
4555        case URX_ONECHAR:
4556            if (fp->fInputIdx < fActiveLimit) {
4557                UChar32 c;
4558                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4559                if (c == opValue) {
4560                    break;
4561                }
4562            } else {
4563                fHitEnd = TRUE;
4564            }
4565
4566            #ifdef REGEX_SMART_BACKTRACKING
4567            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
4568                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4569                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4570                    int64_t reverseIndex = fp->fInputIdx;
4571                    UChar32 c;
4572                    do {
4573                        U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4574                        if (c == opValue) {
4575                            break;
4576                        }
4577                    } while (reverseIndex > backSearchIndex);
4578                    if (c == opValue) {
4579                        fHitEnd = FALSE;
4580                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4581                        fp->fInputIdx = reverseIndex;
4582                        if (fp->fInputIdx > backSearchIndex) {
4583                            fp = StateSave(fp, fp->fPatIdx, status);
4584                        }
4585                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
4586                        break;
4587                    }
4588                }
4589            }
4590            #endif
4591
4592            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4593            break;
4594
4595
4596        case URX_STRING:
4597            {
4598                // Test input against a literal string.
4599                // Strings require two slots in the compiled pattern, one for the
4600                //   offset to the string text, and one for the length.
4601                int32_t   stringStartIdx = opValue;
4602                int32_t   stringLen;
4603
4604                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4605                fp->fPatIdx++;
4606                opType    = URX_TYPE(op);
4607                stringLen = URX_VAL(op);
4608                U_ASSERT(opType == URX_STRING_LEN);
4609                U_ASSERT(stringLen >= 2);
4610
4611                if (fp->fInputIdx + stringLen > fActiveLimit) {
4612                    // No match.  String is longer than the remaining input text.
4613                    fHitEnd = TRUE;          //   TODO:  See ticket 6074
4614                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4615                    break;
4616                }
4617
4618                const UChar * pInp = inputBuf + fp->fInputIdx;
4619                const UChar * pPat = litText+stringStartIdx;
4620                const UChar * pEnd = pInp + stringLen;
4621                UBool success = FALSE;
4622                for(;;) {
4623                    if (*pInp == *pPat) {
4624                        pInp++;
4625                        pPat++;
4626                        if (pInp == pEnd) {
4627                            // Successful Match.
4628                            success = TRUE;
4629                            break;
4630                        }
4631                    } else {
4632                        // Match failed.
4633                        break;
4634                    }
4635                }
4636
4637                if (success) {
4638                    fp->fInputIdx += stringLen;
4639                } else {
4640                    #ifdef REGEX_SMART_BACKTRACKING
4641                    if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4642                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4643                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4644                            // Reset to last start point
4645                            int64_t reverseIndex = fp->fInputIdx;
4646                            UChar32 c;
4647                            pPat = litText+stringStartIdx;
4648
4649                            // Search backwards for a possible start
4650                            do {
4651                                U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4652                                if ((U_IS_BMP(c) && *pPat == c) ||
4653                                    (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
4654                                    success = TRUE;
4655                                    break;
4656                                }
4657                            } while (reverseIndex > backSearchIndex);
4658
4659                            // And try again
4660                            if (success) {
4661                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4662                                fp->fInputIdx = reverseIndex;
4663                                if (fp->fInputIdx > backSearchIndex) {
4664                                    fp = StateSave(fp, fp->fPatIdx, status);
4665                                }
4666                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
4667                                break;
4668                            }
4669                        }
4670                    }
4671                    #endif
4672                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4673                }
4674            }
4675            break;
4676
4677
4678        case URX_STATE_SAVE:
4679            fp = StateSave(fp, opValue, status);
4680            break;
4681
4682
4683        case URX_END:
4684            // The match loop will exit via this path on a successful match,
4685            //   when we reach the end of the pattern.
4686            if (toEnd && fp->fInputIdx != fActiveLimit) {
4687                // The pattern matched, but not to the end of input.  Try some more.
4688                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4689                break;
4690            }
4691            isMatch = TRUE;
4692            goto  breakFromLoop;
4693
4694            // Start and End Capture stack frame variables are laid out out like this:
4695            //  fp->fExtra[opValue]  - The start of a completed capture group
4696            //             opValue+1 - The end   of a completed capture group
4697            //             opValue+2 - the start of a capture group whose end
4698            //                          has not yet been reached (and might not ever be).
4699        case URX_START_CAPTURE:
4700            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4701            fp->fExtra[opValue+2] = fp->fInputIdx;
4702            break;
4703
4704
4705        case URX_END_CAPTURE:
4706            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4707            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4708            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4709            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4710            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4711            break;
4712
4713
4714        case URX_DOLLAR:                   //  $, test for End of line
4715            //     or for position before new line at end of input
4716            if (fp->fInputIdx < fAnchorLimit-2) {
4717                // We are no where near the end of input.  Fail.
4718                //   This is the common case.  Keep it first.
4719                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4720                break;
4721            }
4722            if (fp->fInputIdx >= fAnchorLimit) {
4723                // We really are at the end of input.  Success.
4724                fHitEnd = TRUE;
4725                fRequireEnd = TRUE;
4726                break;
4727            }
4728
4729            // If we are positioned just before a new-line that is located at the
4730            //   end of input, succeed.
4731            if (fp->fInputIdx == fAnchorLimit-1) {
4732                UChar32 c;
4733                U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4734
4735                if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4736                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4737                        // At new-line at end of input. Success
4738                        fHitEnd = TRUE;
4739                        fRequireEnd = TRUE;
4740                        break;
4741                    }
4742                }
4743            } else if (fp->fInputIdx == fAnchorLimit-2 &&
4744                inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4745                    fHitEnd = TRUE;
4746                    fRequireEnd = TRUE;
4747                    break;                         // At CR/LF at end of input.  Success
4748            }
4749
4750            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4751
4752            break;
4753
4754
4755        case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4756            if (fp->fInputIdx >= fAnchorLimit-1) {
4757                // Either at the last character of input, or off the end.
4758                if (fp->fInputIdx == fAnchorLimit-1) {
4759                    // At last char of input.  Success if it's a new line.
4760                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4761                        fHitEnd = TRUE;
4762                        fRequireEnd = TRUE;
4763                        break;
4764                    }
4765                } else {
4766                    // Off the end of input.  Success.
4767                    fHitEnd = TRUE;
4768                    fRequireEnd = TRUE;
4769                    break;
4770                }
4771            }
4772
4773            // Not at end of input.  Back-track out.
4774            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4775            break;
4776
4777
4778        case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4779            {
4780                if (fp->fInputIdx >= fAnchorLimit) {
4781                    // We really are at the end of input.  Success.
4782                    fHitEnd = TRUE;
4783                    fRequireEnd = TRUE;
4784                    break;
4785                }
4786                // If we are positioned just before a new-line, succeed.
4787                // It makes no difference where the new-line is within the input.
4788                UChar32 c = inputBuf[fp->fInputIdx];
4789                if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4790                    // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4791                    //  In multi-line mode, hitting a new-line just before the end of input does not
4792                    //   set the hitEnd or requireEnd flags
4793                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4794                        break;
4795                    }
4796                }
4797                // not at a new line.  Fail.
4798                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4799            }
4800            break;
4801
4802
4803        case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4804            {
4805                if (fp->fInputIdx >= fAnchorLimit) {
4806                    // We really are at the end of input.  Success.
4807                    fHitEnd = TRUE;
4808                    fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4809                    break;               //   adding a new-line would not lose the match.
4810                }
4811                // If we are not positioned just before a new-line, the test fails; backtrack out.
4812                // It makes no difference where the new-line is within the input.
4813                if (inputBuf[fp->fInputIdx] != 0x0a) {
4814                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4815                }
4816            }
4817            break;
4818
4819
4820        case URX_CARET:                    //  ^, test for start of line
4821            if (fp->fInputIdx != fAnchorStart) {
4822                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4823            }
4824            break;
4825
4826
4827        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4828            {
4829                if (fp->fInputIdx == fAnchorStart) {
4830                    // We are at the start input.  Success.
4831                    break;
4832                }
4833                // Check whether character just before the current pos is a new-line
4834                //   unless we are at the end of input
4835                UChar  c = inputBuf[fp->fInputIdx - 1];
4836                if ((fp->fInputIdx < fAnchorLimit) &&
4837                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4838                    //  It's a new-line.  ^ is true.  Success.
4839                    //  TODO:  what should be done with positions between a CR and LF?
4840                    break;
4841                }
4842                // Not at the start of a line.  Fail.
4843                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4844            }
4845            break;
4846
4847
4848        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4849            {
4850                U_ASSERT(fp->fInputIdx >= fAnchorStart);
4851                if (fp->fInputIdx <= fAnchorStart) {
4852                    // We are at the start input.  Success.
4853                    break;
4854                }
4855                // Check whether character just before the current pos is a new-line
4856                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4857                UChar  c = inputBuf[fp->fInputIdx - 1];
4858                if (c != 0x0a) {
4859                    // Not at the start of a line.  Back-track out.
4860                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4861                }
4862            }
4863            break;
4864
4865        case URX_BACKSLASH_B:          // Test for word boundaries
4866            {
4867                UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4868                success ^= (opValue != 0);     // flip sense for \B
4869                if (!success) {
4870                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4871                }
4872            }
4873            break;
4874
4875
4876        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4877            {
4878                UBool success = isUWordBoundary(fp->fInputIdx);
4879                success ^= (opValue != 0);     // flip sense for \B
4880                if (!success) {
4881                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4882                }
4883            }
4884            break;
4885
4886
4887        case URX_BACKSLASH_D:            // Test for decimal digit
4888            {
4889                if (fp->fInputIdx >= fActiveLimit) {
4890                    fHitEnd = TRUE;
4891                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4892                    break;
4893                }
4894
4895                UChar32 c;
4896                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4897                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4898                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4899                success ^= (opValue != 0);        // flip sense for \D
4900                if (!success) {
4901                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4902                }
4903            }
4904            break;
4905
4906
4907        case URX_BACKSLASH_G:          // Test for position at end of previous match
4908            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4909                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4910            }
4911            break;
4912
4913
4914        case URX_BACKSLASH_X:
4915        //  Match a Grapheme, as defined by Unicode TR 29.
4916        //  Differs slightly from Perl, which consumes combining marks independently
4917        //    of context.
4918        {
4919
4920            // Fail if at end of input
4921            if (fp->fInputIdx >= fActiveLimit) {
4922                fHitEnd = TRUE;
4923                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4924                break;
4925            }
4926
4927            // Examine (and consume) the current char.
4928            //   Dispatch into a little state machine, based on the char.
4929            UChar32  c;
4930            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4931            UnicodeSet **sets = fPattern->fStaticSets;
4932            if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4933            if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4934            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4935            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4936            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4937            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4938            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4939            goto GC_Extend;
4940
4941
4942
4943GC_L:
4944            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4945            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4946            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4947            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4948            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4949            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4950            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4951            goto GC_Extend;
4952
4953GC_V:
4954            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4955            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4956            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4957            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4958            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4959            goto GC_Extend;
4960
4961GC_T:
4962            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4963            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4964            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4965            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4966            goto GC_Extend;
4967
4968GC_Extend:
4969            // Combining characters are consumed here
4970            for (;;) {
4971                if (fp->fInputIdx >= fActiveLimit) {
4972                    break;
4973                }
4974                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4975                if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4976                    U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4977                    break;
4978                }
4979            }
4980            goto GC_Done;
4981
4982GC_Control:
4983            // Most control chars stand alone (don't combine with combining chars),
4984            //   except for that CR/LF sequence is a single grapheme cluster.
4985            if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4986                fp->fInputIdx++;
4987            }
4988
4989GC_Done:
4990            if (fp->fInputIdx >= fActiveLimit) {
4991                fHitEnd = TRUE;
4992            }
4993            break;
4994        }
4995
4996
4997
4998
4999        case URX_BACKSLASH_Z:          // Test for end of Input
5000            if (fp->fInputIdx < fAnchorLimit) {
5001                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5002            } else {
5003                fHitEnd = TRUE;
5004                fRequireEnd = TRUE;
5005            }
5006            break;
5007
5008
5009
5010        case URX_STATIC_SETREF:
5011            {
5012                // Test input character against one of the predefined sets
5013                //    (Word Characters, for example)
5014                // The high bit of the op value is a flag for the match polarity.
5015                //    0:   success if input char is in set.
5016                //    1:   success if input char is not in set.
5017                if (fp->fInputIdx >= fActiveLimit) {
5018                    fHitEnd = TRUE;
5019                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5020                    break;
5021                }
5022
5023                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
5024                opValue &= ~URX_NEG_SET;
5025                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5026
5027                UChar32 c;
5028                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5029                if (c < 256) {
5030                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5031                    if (s8->contains(c)) {
5032                        success = !success;
5033                    }
5034                } else {
5035                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
5036                    if (s->contains(c)) {
5037                        success = !success;
5038                    }
5039                }
5040                if (!success) {
5041                    #ifdef REGEX_SMART_BACKTRACKING
5042                    if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5043                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5044                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5045                            // Try to find it, backwards
5046                            int64_t reverseIndex = fp->fInputIdx;
5047                            U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5048                            success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
5049                            do {
5050                                U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5051                                if (c < 256) {
5052                                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5053                                    if (s8->contains(c)) {
5054                                        success = !success;
5055                                    }
5056                                } else {
5057                                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
5058                                    if (s->contains(c)) {
5059                                        success = !success;
5060                                    }
5061                                }
5062                            } while (reverseIndex > backSearchIndex && !success);
5063
5064                            if (success) {
5065                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5066                                fp->fInputIdx = reverseIndex;
5067                                if (fp->fInputIdx > backSearchIndex) {
5068                                    fp = StateSave(fp, fp->fPatIdx, status);
5069                                }
5070                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
5071                                break;
5072                            }
5073                        }
5074                    }
5075                    #endif
5076                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5077                }
5078            }
5079            break;
5080
5081
5082        case URX_STAT_SETREF_N:
5083            {
5084                // Test input character for NOT being a member of  one of
5085                //    the predefined sets (Word Characters, for example)
5086                if (fp->fInputIdx >= fActiveLimit) {
5087                    fHitEnd = TRUE;
5088                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5089                    break;
5090                }
5091
5092                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5093
5094                UChar32  c;
5095                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5096                if (c < 256) {
5097                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5098                    if (s8->contains(c) == FALSE) {
5099                        break;
5100                    }
5101                } else {
5102                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
5103                    if (s->contains(c) == FALSE) {
5104                        break;
5105                    }
5106                }
5107
5108                #ifdef REGEX_SMART_BACKTRACKING
5109                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5110                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5111                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5112                        // Try to find it, backwards
5113                        int64_t reverseIndex = fp->fInputIdx;
5114                        U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5115                        UBool success = FALSE;
5116                        do {
5117                            U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5118                            if (c < 256) {
5119                                Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5120                                if (s8->contains(c) == FALSE) {
5121                                    success = TRUE;
5122                                    break;
5123                                }
5124                            } else {
5125                                const UnicodeSet *s = fPattern->fStaticSets[opValue];
5126                                if (s->contains(c) == FALSE) {
5127                                    success = TRUE;
5128                                    break;
5129                                }
5130                            }
5131                        } while (reverseIndex > backSearchIndex);
5132
5133                        if (success) {
5134                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5135                            fp->fInputIdx = reverseIndex;
5136                            if (fp->fInputIdx > backSearchIndex) {
5137                                fp = StateSave(fp, fp->fPatIdx, status);
5138                            }
5139                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
5140                            break;
5141                        }
5142                    }
5143                }
5144                #endif
5145                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5146            }
5147            break;
5148
5149
5150        case URX_SETREF:
5151            {
5152                if (fp->fInputIdx >= fActiveLimit) {
5153                    fHitEnd = TRUE;
5154                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5155                    break;
5156                }
5157
5158                U_ASSERT(opValue > 0 && opValue < sets->size());
5159
5160                // There is input left.  Pick up one char and test it for set membership.
5161                UChar32  c;
5162                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5163                if (c<256) {
5164                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5165                    if (s8->contains(c)) {
5166                        // The character is in the set.  A Match.
5167                        break;
5168                    }
5169                } else {
5170                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5171                    if (s->contains(c)) {
5172                        // The character is in the set.  A Match.
5173                        break;
5174                    }
5175                }
5176
5177                // the character wasn't in the set.
5178                #ifdef REGEX_SMART_BACKTRACKING
5179                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5180                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5181                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5182                        // Try to find it, backwards
5183                        int64_t reverseIndex = fp->fInputIdx;
5184                        U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5185                        UBool success = FALSE;
5186                        do {
5187                            U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5188                            if (c < 256) {
5189                                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5190                                if (s8->contains(c)) {
5191                                    success = TRUE;
5192                                    break;
5193                                }
5194                            } else {
5195                                UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5196                                if (s->contains(c)) {
5197                                    success = TRUE;
5198                                    break;
5199                                }
5200                            }
5201                        } while (reverseIndex > backSearchIndex);
5202
5203                        if (success) {
5204                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5205                            fp->fInputIdx = reverseIndex;
5206                            if (fp->fInputIdx > reverseIndex) {
5207                                fp = StateSave(fp, fp->fPatIdx, status);
5208                            }
5209                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
5210                            break;
5211                        }
5212                    }
5213                }
5214                #endif
5215                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5216            }
5217            break;
5218
5219
5220        case URX_DOTANY:
5221            {
5222                // . matches anything, but stops at end-of-line.
5223                if (fp->fInputIdx >= fActiveLimit) {
5224                    // At end of input.  Match failed.  Backtrack out.
5225                    fHitEnd = TRUE;
5226                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5227                    break;
5228                }
5229
5230                // There is input left.  Advance over one char, unless we've hit end-of-line
5231                UChar32  c;
5232                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5233                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
5234                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
5235                    // End of line in normal mode.   . does not match.
5236                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5237                    break;
5238                }
5239            }
5240            break;
5241
5242
5243        case URX_DOTANY_ALL:
5244            {
5245                // . in dot-matches-all (including new lines) mode
5246                if (fp->fInputIdx >= fActiveLimit) {
5247                    // At end of input.  Match failed.  Backtrack out.
5248                    fHitEnd = TRUE;
5249                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5250                    break;
5251                }
5252
5253                // There is input left.  Advance over one char, except if we are
5254                //   at a cr/lf, advance over both of them.
5255                UChar32 c;
5256                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5257                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
5258                    // In the case of a CR/LF, we need to advance over both.
5259                    if (inputBuf[fp->fInputIdx] == 0x0a) {
5260                        U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5261                    }
5262                }
5263            }
5264            break;
5265
5266
5267        case URX_DOTANY_UNIX:
5268            {
5269                // '.' operator, matches all, but stops at end-of-line.
5270                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
5271                if (fp->fInputIdx >= fActiveLimit) {
5272                    // At end of input.  Match failed.  Backtrack out.
5273                    fHitEnd = TRUE;
5274                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5275                    break;
5276                }
5277
5278                // There is input left.  Advance over one char, unless we've hit end-of-line
5279                UChar32 c;
5280                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5281                if (c == 0x0a) {
5282                    // End of line in normal mode.   '.' does not match the \n
5283                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5284                }
5285            }
5286            break;
5287
5288
5289        case URX_JMP:
5290            fp->fPatIdx = opValue;
5291            break;
5292
5293        case URX_FAIL:
5294            isMatch = FALSE;
5295            goto breakFromLoop;
5296
5297        case URX_JMP_SAV:
5298            U_ASSERT(opValue < fPattern->fCompiledPat->size());
5299            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
5300            fp->fPatIdx = opValue;                         // Then JMP.
5301            break;
5302
5303        case URX_JMP_SAV_X:
5304            // This opcode is used with (x)+, when x can match a zero length string.
5305            // Same as JMP_SAV, except conditional on the match having made forward progress.
5306            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5307            //   data address of the input position at the start of the loop.
5308            {
5309                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5310                int32_t  stoOp = (int32_t)pat[opValue-1];
5311                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5312                int32_t  frameLoc = URX_VAL(stoOp);
5313                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5314                int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5315                U_ASSERT(prevInputIdx <= fp->fInputIdx);
5316                if (prevInputIdx < fp->fInputIdx) {
5317                    // The match did make progress.  Repeat the loop.
5318                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
5319                    fp->fPatIdx = opValue;
5320                    fp->fExtra[frameLoc] = fp->fInputIdx;
5321                }
5322                // If the input position did not advance, we do nothing here,
5323                //   execution will fall out of the loop.
5324            }
5325            break;
5326
5327        case URX_CTR_INIT:
5328            {
5329                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5330                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5331
5332                // Pick up the three extra operands that CTR_INIT has, and
5333                //    skip the pattern location counter past
5334                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5335                fp->fPatIdx += 3;
5336                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5337                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5338                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5339                U_ASSERT(minCount>=0);
5340                U_ASSERT(maxCount>=minCount || maxCount==-1);
5341                U_ASSERT(loopLoc>fp->fPatIdx);
5342
5343                if (minCount == 0) {
5344                    fp = StateSave(fp, loopLoc+1, status);
5345                }
5346                if (maxCount == 0) {
5347                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5348                }
5349            }
5350            break;
5351
5352        case URX_CTR_LOOP:
5353            {
5354                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5355                int32_t initOp = (int32_t)pat[opValue];
5356                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5357                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5358                int32_t minCount  = (int32_t)pat[opValue+2];
5359                int32_t maxCount  = (int32_t)pat[opValue+3];
5360                // Increment the counter.  Note: we DIDN'T worry about counter
5361                //   overflow, since the data comes from UnicodeStrings, which
5362                //   stores its length in an int32_t. Do we have to think about
5363                //   this now that we're using UText? Probably not, since the length
5364                //   in UChar32s is still an int32_t.
5365                (*pCounter)++;
5366                U_ASSERT(*pCounter > 0);
5367                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5368                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
5369                    break;
5370                }
5371                if (*pCounter >= minCount) {
5372                    fp = StateSave(fp, fp->fPatIdx, status);
5373                }
5374                fp->fPatIdx = opValue + 4;    // Loop back.
5375            }
5376            break;
5377
5378        case URX_CTR_INIT_NG:
5379            {
5380                // Initialize a non-greedy loop
5381                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5382                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5383
5384                // Pick up the three extra operands that CTR_INIT has, and
5385                //    skip the pattern location counter past
5386                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5387                fp->fPatIdx += 3;
5388                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5389                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5390                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5391                U_ASSERT(minCount>=0);
5392                U_ASSERT(maxCount>=minCount || maxCount==-1);
5393                U_ASSERT(loopLoc>fp->fPatIdx);
5394
5395                if (minCount == 0) {
5396                    if (maxCount != 0) {
5397                        fp = StateSave(fp, fp->fPatIdx, status);
5398                    }
5399                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5400                }
5401            }
5402            break;
5403
5404        case URX_CTR_LOOP_NG:
5405            {
5406                // Non-greedy {min, max} loops
5407                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5408                int32_t initOp = (int32_t)pat[opValue];
5409                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5410                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5411                int32_t minCount  = (int32_t)pat[opValue+2];
5412                int32_t maxCount  = (int32_t)pat[opValue+3];
5413                // Increment the counter.  Note: we DIDN'T worry about counter
5414                //   overflow, since the data comes from UnicodeStrings, which
5415                //   stores its length in an int32_t. Do we have to think about
5416                //   this now that we're using UText? Probably not, since the length
5417                //   in UChar32s is still an int32_t.
5418                (*pCounter)++;
5419                U_ASSERT(*pCounter > 0);
5420
5421                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5422                    // The loop has matched the maximum permitted number of times.
5423                    //   Break out of here with no action.  Matching will
5424                    //   continue with the following pattern.
5425                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
5426                    break;
5427                }
5428
5429                if (*pCounter < minCount) {
5430                    // We haven't met the minimum number of matches yet.
5431                    //   Loop back for another one.
5432                    fp->fPatIdx = opValue + 4;    // Loop back.
5433                } else {
5434                    // We do have the minimum number of matches.
5435                    //   Fall into the following pattern, but first do
5436                    //   a state save to the top of the loop, so that a failure
5437                    //   in the following pattern will try another iteration of the loop.
5438                    fp = StateSave(fp, opValue + 4, status);
5439                }
5440            }
5441            break;
5442
5443        case URX_STO_SP:
5444            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5445            fData[opValue] = fStack->size();
5446            break;
5447
5448        case URX_LD_SP:
5449            {
5450                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5451                int32_t newStackSize = (int32_t)fData[opValue];
5452                U_ASSERT(newStackSize <= fStack->size());
5453                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5454                if (newFP == (int64_t *)fp) {
5455                    break;
5456                }
5457                int32_t i;
5458                for (i=0; i<fFrameSize; i++) {
5459                    newFP[i] = ((int64_t *)fp)[i];
5460                }
5461                fp = (REStackFrame *)newFP;
5462                fStack->setSize(newStackSize);
5463            }
5464            break;
5465
5466        case URX_BACKREF:
5467        case URX_BACKREF_I:
5468            {
5469                U_ASSERT(opValue < fFrameSize);
5470                int64_t groupStartIdx = fp->fExtra[opValue];
5471                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5472                U_ASSERT(groupStartIdx <= groupEndIdx);
5473                int64_t len = groupEndIdx-groupStartIdx;
5474                if (groupStartIdx < 0) {
5475                    // This capture group has not participated in the match thus far,
5476                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5477                }
5478
5479                if (len == 0) {
5480                        //   The capture group match was of an empty string.
5481                        //   Verified by testing:  Perl matches succeed in this case, so
5482                        //   we do too.
5483                        break;
5484                    }
5485
5486                UBool  haveMatch = FALSE;
5487                if (fp->fInputIdx + len <= fActiveLimit) {
5488                    if (opType == URX_BACKREF) {
5489                        if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
5490                            haveMatch = TRUE;
5491                        }
5492                    } else {
5493                        if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
5494                                  (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
5495                            haveMatch = TRUE;
5496                        }
5497                    }
5498                } else {
5499                    // TODO: probably need to do a partial string comparison, and only
5500                    //       set HitEnd if the available input matched.  Ticket #6074
5501                    fHitEnd = TRUE;
5502                }
5503                if (haveMatch) {
5504                    fp->fInputIdx += len;     // Match.  Advance current input position.
5505                } else {
5506                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5507                }
5508            }
5509            break;
5510
5511        case URX_STO_INP_LOC:
5512            {
5513                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5514                fp->fExtra[opValue] = fp->fInputIdx;
5515            }
5516            break;
5517
5518        case URX_JMPX:
5519            {
5520                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5521                fp->fPatIdx += 1;
5522                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5523                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5524                int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5525                U_ASSERT(savedInputIdx <= fp->fInputIdx);
5526                if (savedInputIdx < fp->fInputIdx) {
5527                    fp->fPatIdx = opValue;                               // JMP
5528                } else {
5529                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5530                }
5531            }
5532            break;
5533
5534        case URX_LA_START:
5535            {
5536                // Entering a lookahead block.
5537                // Save Stack Ptr, Input Pos.
5538                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5539                fData[opValue]   = fStack->size();
5540                fData[opValue+1] = fp->fInputIdx;
5541                fActiveStart     = fLookStart;          // Set the match region change for
5542                fActiveLimit     = fLookLimit;          //   transparent bounds.
5543            }
5544            break;
5545
5546        case URX_LA_END:
5547            {
5548                // Leaving a look-ahead block.
5549                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5550                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5551                int32_t stackSize = fStack->size();
5552                int32_t newStackSize = (int32_t)fData[opValue];
5553                U_ASSERT(stackSize >= newStackSize);
5554                if (stackSize > newStackSize) {
5555                    // Copy the current top frame back to the new (cut back) top frame.
5556                    //   This makes the capture groups from within the look-ahead
5557                    //   expression available.
5558                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5559                    int32_t i;
5560                    for (i=0; i<fFrameSize; i++) {
5561                        newFP[i] = ((int64_t *)fp)[i];
5562                    }
5563                    fp = (REStackFrame *)newFP;
5564                    fStack->setSize(newStackSize);
5565                }
5566                fp->fInputIdx = fData[opValue+1];
5567
5568                // Restore the active region bounds in the input string; they may have
5569                //    been changed because of transparent bounds on a Region.
5570                fActiveStart = fRegionStart;
5571                fActiveLimit = fRegionLimit;
5572            }
5573            break;
5574
5575        case URX_ONECHAR_I:
5576            if (fp->fInputIdx < fActiveLimit) {
5577                UChar32 c;
5578                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5579                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5580                    break;
5581                }
5582            } else {
5583                fHitEnd = TRUE;
5584            }
5585
5586            #ifdef REGEX_SMART_BACKTRACKING
5587            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5588                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5589                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5590                    UBool success = FALSE;
5591                    int64_t reverseIndex = fp->fInputIdx;
5592                    UChar32 c;
5593                    while (reverseIndex > backSearchIndex) {
5594                        U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5595                        if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5596                            success = TRUE;
5597                            break;
5598                        } else if (c == U_SENTINEL) {
5599                            break;
5600                        }
5601                    }
5602                    if (success) {
5603                        fHitEnd = FALSE;
5604                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5605                        fp->fInputIdx = reverseIndex;
5606                        if (fp->fInputIdx > backSearchIndex) {
5607                            fp = StateSave(fp, fp->fPatIdx, status);
5608                        }
5609                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
5610                        break;
5611                    }
5612                }
5613            }
5614            #endif
5615
5616            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5617            break;
5618
5619        case URX_STRING_I:
5620            {
5621                // Test input against a literal string.
5622                // Strings require two slots in the compiled pattern, one for the
5623                //   offset to the string text, and one for the length.
5624                const UCaseProps *csp = ucase_getSingleton();
5625                {
5626                    int32_t stringStartIdx, stringLen;
5627                    stringStartIdx = opValue;
5628
5629                    op      = (int32_t)pat[fp->fPatIdx];
5630                    fp->fPatIdx++;
5631                    opType  = URX_TYPE(op);
5632                    opValue = URX_VAL(op);
5633                    U_ASSERT(opType == URX_STRING_LEN);
5634                    stringLen = opValue;
5635
5636                    const UChar *patternChars = litText+stringStartIdx;
5637                    const UChar *patternEnd = patternChars+stringLen;
5638
5639                    const UChar *foldChars = NULL;
5640                    int32_t foldOffset, foldLength;
5641                    UChar32 c;
5642                    UBool c_is_valid = FALSE;
5643
5644                    #ifdef REGEX_SMART_BACKTRACKING
5645                    int32_t originalInputIdx = fp->fInputIdx;
5646                    #endif
5647                    UBool success = TRUE;
5648
5649                    foldOffset = foldLength = 0;
5650
5651                    while (patternChars < patternEnd && success) {
5652                        if (fp->fInputIdx < fActiveLimit) {  // don't read past end of string
5653                            if(foldOffset < foldLength) {
5654                                U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5655                                c_is_valid = TRUE;
5656                            } else {
5657                                // test pre-condition of U16_NEXT: i < length
5658                                U_ASSERT(fp->fInputIdx < fActiveLimit);
5659                                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5660                                c_is_valid = TRUE;
5661                                foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5662                                if(foldLength >= 0) {
5663                                    if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5664                                        foldOffset = 0;
5665                                        U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5666                                    } else {
5667                                        c = foldLength;
5668                                        foldLength = foldOffset; // to avoid reading chars from the folding buffer
5669                                    }
5670                                }
5671                            }
5672                        } else {
5673                          c_is_valid = FALSE;
5674                        }
5675
5676                        if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
5677                            if (U_IS_BMP(c)) {
5678                                success = (*patternChars == c);
5679                                patternChars += 1;
5680                            } else if (patternChars+1 < patternEnd) {
5681                                success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
5682                                patternChars += 2;
5683                            }
5684                        } else {
5685                            success = FALSE;
5686                            fHitEnd = TRUE;          //   TODO:  See ticket 6074
5687                        }
5688                    }
5689
5690                    if (!success) {
5691                        #ifdef REGEX_SMART_BACKTRACKING
5692                        if (fp->fInputIdx > backSearchIndex && fStack->size()) {
5693                            REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5694                            if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5695                                // Reset to last start point
5696                                int64_t reverseIndex = originalInputIdx;
5697                                patternChars = litText+stringStartIdx;
5698
5699                                // Search backwards for a possible start
5700                                do {
5701                                    U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5702                                    foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5703                                    if(foldLength >= 0) {
5704                                        if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5705                                            foldOffset = 0;
5706                                            U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5707                                        } else {
5708                                            c = foldLength;
5709                                            foldLength = foldOffset; // to avoid reading chars from the folding buffer
5710                                        }
5711                                    }
5712
5713                                    if ((U_IS_BMP(c) && *patternChars == c) ||
5714                                           (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
5715                                        success = TRUE;
5716                                        break;
5717                                    }
5718                                } while (reverseIndex > backSearchIndex);
5719
5720                                // And try again
5721                                if (success) {
5722                                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5723                                    fp->fInputIdx = reverseIndex;
5724                                    if (fp->fInputIdx > backSearchIndex) {
5725                                        fp = StateSave(fp, fp->fPatIdx, status);
5726                                    }
5727                                    fp->fPatIdx++; // Skip the LOOP_C, we just did that
5728                                    break;
5729                                }
5730                            }
5731                        }
5732                        #endif
5733                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5734                    }
5735                }
5736            }
5737            break;
5738
5739        case URX_LB_START:
5740            {
5741                // Entering a look-behind block.
5742                // Save Stack Ptr, Input Pos.
5743                //   TODO:  implement transparent bounds.  Ticket #6067
5744                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5745                fData[opValue]   = fStack->size();
5746                fData[opValue+1] = fp->fInputIdx;
5747                // Init the variable containing the start index for attempted matches.
5748                fData[opValue+2] = -1;
5749                // Save input string length, then reset to pin any matches to end at
5750                //   the current position.
5751                fData[opValue+3] = fActiveLimit;
5752                fActiveLimit     = fp->fInputIdx;
5753            }
5754            break;
5755
5756
5757        case URX_LB_CONT:
5758            {
5759                // Positive Look-Behind, at top of loop checking for matches of LB expression
5760                //    at all possible input starting positions.
5761
5762                // Fetch the min and max possible match lengths.  They are the operands
5763                //   of this op in the pattern.
5764                int32_t minML = (int32_t)pat[fp->fPatIdx++];
5765                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5766                U_ASSERT(minML <= maxML);
5767                U_ASSERT(minML >= 0);
5768
5769                // Fetch (from data) the last input index where a match was attempted.
5770                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5771                int64_t  *lbStartIdx = &fData[opValue+2];
5772                if (*lbStartIdx < 0) {
5773                    // First time through loop.
5774                    *lbStartIdx = fp->fInputIdx - minML;
5775                } else {
5776                    // 2nd through nth time through the loop.
5777                    // Back up start position for match by one.
5778                    if (*lbStartIdx == 0) {
5779                        (*lbStartIdx)--;
5780                    } else {
5781                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
5782                    }
5783                }
5784
5785                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5786                    // We have tried all potential match starting points without
5787                    //  getting a match.  Backtrack out, and out of the
5788                    //   Look Behind altogether.
5789                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5790                    int64_t restoreInputLen = fData[opValue+3];
5791                    U_ASSERT(restoreInputLen >= fActiveLimit);
5792                    U_ASSERT(restoreInputLen <= fInputLength);
5793                    fActiveLimit = restoreInputLen;
5794                    break;
5795                }
5796
5797                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5798                //      (successful match will fall off the end of the loop.)
5799                fp = StateSave(fp, fp->fPatIdx-3, status);
5800                fp->fInputIdx =  *lbStartIdx;
5801            }
5802            break;
5803
5804        case URX_LB_END:
5805            // End of a look-behind block, after a successful match.
5806            {
5807                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5808                if (fp->fInputIdx != fActiveLimit) {
5809                    //  The look-behind expression matched, but the match did not
5810                    //    extend all the way to the point that we are looking behind from.
5811                    //  FAIL out of here, which will take us back to the LB_CONT, which
5812                    //     will retry the match starting at another position or fail
5813                    //     the look-behind altogether, whichever is appropriate.
5814                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5815                    break;
5816                }
5817
5818                // Look-behind match is good.  Restore the orignal input string length,
5819                //   which had been truncated to pin the end of the lookbehind match to the
5820                //   position being looked-behind.
5821                int64_t originalInputLen = fData[opValue+3];
5822                U_ASSERT(originalInputLen >= fActiveLimit);
5823                U_ASSERT(originalInputLen <= fInputLength);
5824                fActiveLimit = originalInputLen;
5825            }
5826            break;
5827
5828
5829        case URX_LBN_CONT:
5830            {
5831                // Negative Look-Behind, at top of loop checking for matches of LB expression
5832                //    at all possible input starting positions.
5833
5834                // Fetch the extra parameters of this op.
5835                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5836                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5837                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5838                continueLoc = URX_VAL(continueLoc);
5839                U_ASSERT(minML <= maxML);
5840                U_ASSERT(minML >= 0);
5841                U_ASSERT(continueLoc > fp->fPatIdx);
5842
5843                // Fetch (from data) the last input index where a match was attempted.
5844                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5845                int64_t  *lbStartIdx = &fData[opValue+2];
5846                if (*lbStartIdx < 0) {
5847                    // First time through loop.
5848                    *lbStartIdx = fp->fInputIdx - minML;
5849                } else {
5850                    // 2nd through nth time through the loop.
5851                    // Back up start position for match by one.
5852                    if (*lbStartIdx == 0) {
5853                        (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
5854                    } else {
5855                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
5856                    }
5857                }
5858
5859                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5860                    // We have tried all potential match starting points without
5861                    //  getting a match, which means that the negative lookbehind as
5862                    //  a whole has succeeded.  Jump forward to the continue location
5863                    int64_t restoreInputLen = fData[opValue+3];
5864                    U_ASSERT(restoreInputLen >= fActiveLimit);
5865                    U_ASSERT(restoreInputLen <= fInputLength);
5866                    fActiveLimit = restoreInputLen;
5867                    fp->fPatIdx = continueLoc;
5868                    break;
5869                }
5870
5871                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5872                //      (successful match will cause a FAIL out of the loop altogether.)
5873                fp = StateSave(fp, fp->fPatIdx-4, status);
5874                fp->fInputIdx =  *lbStartIdx;
5875            }
5876            break;
5877
5878        case URX_LBN_END:
5879            // End of a negative look-behind block, after a successful match.
5880            {
5881                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5882                if (fp->fInputIdx != fActiveLimit) {
5883                    //  The look-behind expression matched, but the match did not
5884                    //    extend all the way to the point that we are looking behind from.
5885                    //  FAIL out of here, which will take us back to the LB_CONT, which
5886                    //     will retry the match starting at another position or succeed
5887                    //     the look-behind altogether, whichever is appropriate.
5888                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5889                    break;
5890                }
5891
5892                // Look-behind expression matched, which means look-behind test as
5893                //   a whole Fails
5894
5895                //   Restore the orignal input string length, which had been truncated
5896                //   inorder to pin the end of the lookbehind match
5897                //   to the position being looked-behind.
5898                int64_t originalInputLen = fData[opValue+3];
5899                U_ASSERT(originalInputLen >= fActiveLimit);
5900                U_ASSERT(originalInputLen <= fInputLength);
5901                fActiveLimit = originalInputLen;
5902
5903                // Restore original stack position, discarding any state saved
5904                //   by the successful pattern match.
5905                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5906                int32_t newStackSize = (int32_t)fData[opValue];
5907                U_ASSERT(fStack->size() > newStackSize);
5908                fStack->setSize(newStackSize);
5909
5910                //  FAIL, which will take control back to someplace
5911                //  prior to entering the look-behind test.
5912                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5913            }
5914            break;
5915
5916
5917        case URX_LOOP_SR_I:
5918            // Loop Initialization for the optimized implementation of
5919            //     [some character set]*
5920            //   This op scans through all matching input.
5921            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5922            {
5923                U_ASSERT(opValue > 0 && opValue < sets->size());
5924                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5925                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5926
5927                // Loop through input, until either the input is exhausted or
5928                //   we reach a character that is not a member of the set.
5929                int32_t ix = (int32_t)fp->fInputIdx;
5930                for (;;) {
5931                    if (ix >= fActiveLimit) {
5932                        fHitEnd = TRUE;
5933                        break;
5934                    }
5935                    UChar32   c;
5936                    U16_NEXT(inputBuf, ix, fActiveLimit, c);
5937                    if (c<256) {
5938                        if (s8->contains(c) == FALSE) {
5939                            U16_BACK_1(inputBuf, 0, ix);
5940                            break;
5941                        }
5942                    } else {
5943                        if (s->contains(c) == FALSE) {
5944                            U16_BACK_1(inputBuf, 0, ix);
5945                            break;
5946                        }
5947                    }
5948                }
5949
5950                // If there were no matching characters, skip over the loop altogether.
5951                //   The loop doesn't run at all, a * op always succeeds.
5952                if (ix == fp->fInputIdx) {
5953                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5954                    break;
5955                }
5956
5957                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5958                //   must follow.  It's operand is the stack location
5959                //   that holds the starting input index for the match of this [set]*
5960                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5961                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5962                int32_t stackLoc = URX_VAL(loopcOp);
5963                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5964                fp->fExtra[stackLoc] = fp->fInputIdx;
5965                #ifdef REGEX_SMART_BACKTRACKING
5966                backSearchIndex = fp->fInputIdx;
5967                #endif
5968                fp->fInputIdx = ix;
5969
5970                // Save State to the URX_LOOP_C op that follows this one,
5971                //   so that match failures in the following code will return to there.
5972                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5973                fp = StateSave(fp, fp->fPatIdx, status);
5974                fp->fPatIdx++;
5975            }
5976            break;
5977
5978
5979        case URX_LOOP_DOT_I:
5980            // Loop Initialization for the optimized implementation of .*
5981            //   This op scans through all remaining input.
5982            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5983            {
5984                // Loop through input until the input is exhausted (we reach an end-of-line)
5985                // In DOTALL mode, we can just go straight to the end of the input.
5986                int32_t ix;
5987                if ((opValue & 1) == 1) {
5988                    // Dot-matches-All mode.  Jump straight to the end of the string.
5989                    ix = (int32_t)fActiveLimit;
5990                    fHitEnd = TRUE;
5991                } else {
5992                    // NOT DOT ALL mode.  Line endings do not match '.'
5993                    // Scan forward until a line ending or end of input.
5994                    ix = (int32_t)fp->fInputIdx;
5995                    for (;;) {
5996                        if (ix >= fActiveLimit) {
5997                            fHitEnd = TRUE;
5998                            break;
5999                        }
6000                        UChar32   c;
6001                        U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
6002                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
6003                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
6004                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
6005                                   ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
6006                                //  char is a line ending.  Put the input pos back to the
6007                                //    line ending char, and exit the scanning loop.
6008                                U16_BACK_1(inputBuf, 0, ix);
6009                                break;
6010                            }
6011                        }
6012                    }
6013                }
6014
6015                // If there were no matching characters, skip over the loop altogether.
6016                //   The loop doesn't run at all, a * op always succeeds.
6017                if (ix == fp->fInputIdx) {
6018                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
6019                    break;
6020                }
6021
6022                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
6023                //   must follow.  It's operand is the stack location
6024                //   that holds the starting input index for the match of this .*
6025                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
6026                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
6027                int32_t stackLoc = URX_VAL(loopcOp);
6028                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
6029                fp->fExtra[stackLoc] = fp->fInputIdx;
6030                #ifdef REGEX_SMART_BACKTRACKING
6031                backSearchIndex = fp->fInputIdx;
6032                #endif
6033                fp->fInputIdx = ix;
6034
6035                // Save State to the URX_LOOP_C op that follows this one,
6036                //   so that match failures in the following code will return to there.
6037                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
6038                fp = StateSave(fp, fp->fPatIdx, status);
6039                fp->fPatIdx++;
6040            }
6041            break;
6042
6043
6044        case URX_LOOP_C:
6045            {
6046                U_ASSERT(opValue>=0 && opValue<fFrameSize);
6047                backSearchIndex = (int32_t)fp->fExtra[opValue];
6048                U_ASSERT(backSearchIndex <= fp->fInputIdx);
6049                if (backSearchIndex == fp->fInputIdx) {
6050                    // We've backed up the input idx to the point that the loop started.
6051                    // The loop is done.  Leave here without saving state.
6052                    //  Subsequent failures won't come back here.
6053                    break;
6054                }
6055                // Set up for the next iteration of the loop, with input index
6056                //   backed up by one from the last time through,
6057                //   and a state save to this instruction in case the following code fails again.
6058                //   (We're going backwards because this loop emulates stack unwinding, not
6059                //    the initial scan forward.)
6060                U_ASSERT(fp->fInputIdx > 0);
6061                UChar32 prevC;
6062                U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
6063
6064                if (prevC == 0x0a &&
6065                    fp->fInputIdx > backSearchIndex &&
6066                    inputBuf[fp->fInputIdx-1] == 0x0d) {
6067                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
6068                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
6069                        // .*, stepping back over CRLF pair.
6070                        U16_BACK_1(inputBuf, 0, fp->fInputIdx);
6071                    }
6072                }
6073
6074
6075                fp = StateSave(fp, fp->fPatIdx-1, status);
6076            }
6077            break;
6078
6079
6080
6081        default:
6082            // Trouble.  The compiled pattern contains an entry with an
6083            //           unrecognized type tag.
6084            U_ASSERT(FALSE);
6085        }
6086
6087        if (U_FAILURE(status)) {
6088            isMatch = FALSE;
6089            break;
6090        }
6091    }
6092
6093breakFromLoop:
6094    fMatch = isMatch;
6095    if (isMatch) {
6096        fLastMatchEnd = fMatchEnd;
6097        fMatchStart   = startIdx;
6098        fMatchEnd     = fp->fInputIdx;
6099        if (fTraceDebug) {
6100            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
6101        }
6102    }
6103    else
6104    {
6105        if (fTraceDebug) {
6106            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
6107        }
6108    }
6109
6110    fFrame = fp;                // The active stack frame when the engine stopped.
6111    //   Contains the capture group results that we need to
6112    //    access later.
6113
6114    return;
6115}
6116
6117
6118UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
6119
6120U_NAMESPACE_END
6121
6122#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
6123