1/*
2**************************************************************************
3*   Copyright (C) 2002-2010 International Business Machines Corporation  *
4*   and others. All rights reserved.                                     *
5**************************************************************************
6*/
7//
8//  file:  rematch.cpp
9//
10//         Contains the implementation of class RegexMatcher,
11//         which is one of the main API classes for the ICU regular expression package.
12//
13
14#include "unicode/utypes.h"
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/regex.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/ustring.h"
21#include "unicode/rbbi.h"
22#include "uassert.h"
23#include "cmemory.h"
24#include "uvector.h"
25#include "uvectr32.h"
26#include "uvectr64.h"
27#include "regeximp.h"
28#include "regexst.h"
29#include "regextxt.h"
30#include "ucase.h"
31
32// #include <malloc.h>        // Needed for heapcheck testing
33
34
35// Find progress callback
36// ----------------------
37// Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
38//
39#define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
40    (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
41
42
43// Smart Backtracking
44// ------------------
45// When a failure would go back to a LOOP_C instruction,
46// strings, characters, and setrefs scan backwards for a valid start
47// character themselves, pop the stack, and save state, emulating the
48// LOOP_C's effect but assured that the next character of input is a
49// possible matching character.
50//
51// Good idea in theory; unfortunately it only helps out a few specific
52// cases and slows the engine down a little in the rest.
53
54//#define REGEX_SMART_BACKTRACKING 1
55
56U_NAMESPACE_BEGIN
57
58// Default limit for the size of the back track stack, to avoid system
59//    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
60// This value puts ICU's limits higher than most other regexp implementations,
61//    which use recursion rather than the heap, and take more storage per
62//    backtrack point.
63//
64static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66// Time limit counter constant.
67//   Time limits for expression evaluation are in terms of quanta of work by
68//   the engine, each of which is 10,000 state saves.
69//   This constant determines that state saves per tick number.
70static const int32_t TIMER_INITIAL_VALUE = 10000;
71
72//-----------------------------------------------------------------------------
73//
74//   Constructor and Destructor
75//
76//-----------------------------------------------------------------------------
77RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
78    fDeferredStatus = U_ZERO_ERROR;
79    init(fDeferredStatus);
80    if (U_FAILURE(fDeferredStatus)) {
81        return;
82    }
83    if (pat==NULL) {
84        fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85        return;
86    }
87    fPattern = pat;
88    init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89}
90
91
92
93RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94                           uint32_t flags, UErrorCode &status) {
95    init(status);
96    if (U_FAILURE(status)) {
97        return;
98    }
99    UParseError    pe;
100    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
101    fPattern           = fPatternOwned;
102
103    UText inputText = UTEXT_INITIALIZER;
104    utext_openConstUnicodeString(&inputText, &input, &status);
105    init2(&inputText, status);
106    utext_close(&inputText);
107
108    fInputUniStrMaybeMutable = TRUE;
109}
110
111
112RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113                           uint32_t flags, UErrorCode &status) {
114    init(status);
115    if (U_FAILURE(status)) {
116        return;
117    }
118    UParseError    pe;
119    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
120    if (U_FAILURE(status)) {
121        return;
122    }
123
124    fPattern           = fPatternOwned;
125    init2(input, status);
126}
127
128
129RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130                           uint32_t flags, UErrorCode &status) {
131    init(status);
132    if (U_FAILURE(status)) {
133        return;
134    }
135    UParseError    pe;
136    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
137    if (U_FAILURE(status)) {
138        return;
139    }
140    fPattern           = fPatternOwned;
141    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142}
143
144RegexMatcher::RegexMatcher(UText *regexp,
145                           uint32_t flags, UErrorCode &status) {
146    init(status);
147    if (U_FAILURE(status)) {
148        return;
149    }
150    UParseError    pe;
151    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
152        if (U_FAILURE(status)) {
153        return;
154    }
155
156    fPattern           = fPatternOwned;
157    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158}
159
160
161
162
163RegexMatcher::~RegexMatcher() {
164    delete fStack;
165    if (fData != fSmallData) {
166        uprv_free(fData);
167        fData = NULL;
168    }
169    if (fPatternOwned) {
170        delete fPatternOwned;
171        fPatternOwned = NULL;
172        fPattern = NULL;
173    }
174
175    if (fInput) {
176        delete fInput;
177    }
178    if (fInputText) {
179        utext_close(fInputText);
180    }
181    if (fAltInputText) {
182        utext_close(fAltInputText);
183    }
184
185    #if UCONFIG_NO_BREAK_ITERATION==0
186    delete fWordBreakItr;
187    #endif
188}
189
190//
191//   init()   common initialization for use by all constructors.
192//            Initialize all fields, get the object into a consistent state.
193//            This must be done even when the initial status shows an error,
194//            so that the object is initialized sufficiently well for the destructor
195//            to run safely.
196//
197void RegexMatcher::init(UErrorCode &status) {
198    fPattern           = NULL;
199    fPatternOwned      = NULL;
200    fFrameSize         = 0;
201    fRegionStart       = 0;
202    fRegionLimit       = 0;
203    fAnchorStart       = 0;
204    fAnchorLimit       = 0;
205    fLookStart         = 0;
206    fLookLimit         = 0;
207    fActiveStart       = 0;
208    fActiveLimit       = 0;
209    fTransparentBounds = FALSE;
210    fAnchoringBounds   = TRUE;
211    fMatch             = FALSE;
212    fMatchStart        = 0;
213    fMatchEnd          = 0;
214    fLastMatchEnd      = -1;
215    fAppendPosition    = 0;
216    fHitEnd            = FALSE;
217    fRequireEnd        = FALSE;
218    fStack             = NULL;
219    fFrame             = NULL;
220    fTimeLimit         = 0;
221    fTime              = 0;
222    fTickCounter       = 0;
223    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
224    fCallbackFn        = NULL;
225    fCallbackContext   = NULL;
226    fFindProgressCallbackFn      = NULL;
227    fFindProgressCallbackContext = NULL;
228    fTraceDebug        = FALSE;
229    fDeferredStatus    = status;
230    fData              = fSmallData;
231    fWordBreakItr      = NULL;
232
233    fStack             = new UVector64(status);
234    fInputText         = NULL;
235    fAltInputText      = NULL;
236    fInput             = NULL;
237    fInputLength       = 0;
238    fInputUniStrMaybeMutable = FALSE;
239
240    if (U_FAILURE(status)) {
241        fDeferredStatus = status;
242    }
243}
244
245//
246//  init2()   Common initialization for use by RegexMatcher constructors, part 2.
247//            This handles the common setup to be done after the Pattern is available.
248//
249void RegexMatcher::init2(UText *input, UErrorCode &status) {
250    if (U_FAILURE(status)) {
251        fDeferredStatus = status;
252        return;
253    }
254
255    if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256        fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257        if (fData == NULL) {
258            status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259            return;
260        }
261    }
262
263    reset(input);
264    setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
265    if (U_FAILURE(status)) {
266        fDeferredStatus = status;
267        return;
268    }
269}
270
271
272static const UChar BACKSLASH  = 0x5c;
273static const UChar DOLLARSIGN = 0x24;
274//--------------------------------------------------------------------------------
275//
276//    appendReplacement
277//
278//--------------------------------------------------------------------------------
279RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280                                              const UnicodeString &replacement,
281                                              UErrorCode &status) {
282    UText replacementText = UTEXT_INITIALIZER;
283
284    utext_openConstUnicodeString(&replacementText, &replacement, &status);
285    if (U_SUCCESS(status)) {
286        UText resultText = UTEXT_INITIALIZER;
287        utext_openUnicodeString(&resultText, &dest, &status);
288
289        if (U_SUCCESS(status)) {
290            appendReplacement(&resultText, &replacementText, status);
291            utext_close(&resultText);
292        }
293        utext_close(&replacementText);
294    }
295
296    return *this;
297}
298
299//
300//    appendReplacement, UText mode
301//
302RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303                                              UText *replacement,
304                                              UErrorCode &status) {
305    if (U_FAILURE(status)) {
306        return *this;
307    }
308    if (U_FAILURE(fDeferredStatus)) {
309        status = fDeferredStatus;
310        return *this;
311    }
312    if (fMatch == FALSE) {
313        status = U_REGEX_INVALID_STATE;
314        return *this;
315    }
316
317    // Copy input string from the end of previous match to start of current match
318    int64_t  destLen = utext_nativeLength(dest);
319    if (fMatchStart > fAppendPosition) {
320        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322                                     (int32_t)(fMatchStart-fAppendPosition), &status);
323        } else {
324            int32_t len16;
325            if (UTEXT_USES_U16(fInputText)) {
326                len16 = (int32_t)(fMatchStart-fAppendPosition);
327            } else {
328                UErrorCode lengthStatus = U_ZERO_ERROR;
329                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330            }
331            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332            if (inputChars == NULL) {
333                status = U_MEMORY_ALLOCATION_ERROR;
334                return *this;
335            }
336            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338            uprv_free(inputChars);
339        }
340    }
341    fAppendPosition = fMatchEnd;
342
343
344    // scan the replacement text, looking for substitutions ($n) and \escapes.
345    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
346    //         move entire ranges not containing substitutions.
347    UTEXT_SETNATIVEINDEX(replacement, 0);
348    UChar32 c = UTEXT_NEXT32(replacement);
349    while (c != U_SENTINEL) {
350        if (c == BACKSLASH) {
351            // Backslash Escape.  Copy the following char out without further checks.
352            //                    Note:  Surrogate pairs don't need any special handling
353            //                           The second half wont be a '$' or a '\', and
354            //                           will move to the dest normally on the next
355            //                           loop iteration.
356            c = UTEXT_CURRENT32(replacement);
357            if (c == U_SENTINEL) {
358                break;
359            }
360
361            if (c==0x55/*U*/ || c==0x75/*u*/) {
362                // We have a \udddd or \Udddddddd escape sequence.
363                int32_t offset = 0;
364                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
365                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
366                if (escapedChar != (UChar32)0xFFFFFFFF) {
367                    if (U_IS_BMP(escapedChar)) {
368                        UChar c16 = (UChar)escapedChar;
369                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
370                    } else {
371                        UChar surrogate[2];
372                        surrogate[0] = U16_LEAD(escapedChar);
373                        surrogate[1] = U16_TRAIL(escapedChar);
374                        if (U_SUCCESS(status)) {
375                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
376                        }
377                    }
378                    // TODO:  Report errors for mal-formed \u escapes?
379                    //        As this is, the original sequence is output, which may be OK.
380                    if (context.lastOffset == offset) {
381                        UTEXT_PREVIOUS32(replacement);
382                    } else if (context.lastOffset != offset-1) {
383                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
384                    }
385                }
386            } else {
387                UTEXT_NEXT32(replacement);
388                // Plain backslash escape.  Just put out the escaped character.
389                if (U_IS_BMP(c)) {
390                    UChar c16 = (UChar)c;
391                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
392                } else {
393                    UChar surrogate[2];
394                    surrogate[0] = U16_LEAD(c);
395                    surrogate[1] = U16_TRAIL(c);
396                    if (U_SUCCESS(status)) {
397                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398                    }
399                }
400            }
401        } else if (c != DOLLARSIGN) {
402            // Normal char, not a $.  Copy it out without further checks.
403            if (U_IS_BMP(c)) {
404                UChar c16 = (UChar)c;
405                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
406            } else {
407                UChar surrogate[2];
408                surrogate[0] = U16_LEAD(c);
409                surrogate[1] = U16_TRAIL(c);
410                if (U_SUCCESS(status)) {
411                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
412                }
413            }
414        } else {
415            // We've got a $.  Pick up a capture group number if one follows.
416            // Consume at most the number of digits necessary for the largest capture
417            // number that is valid for this pattern.
418
419            int32_t numDigits = 0;
420            int32_t groupNum  = 0;
421            UChar32 digitC;
422            for (;;) {
423                digitC = UTEXT_CURRENT32(replacement);
424                if (digitC == U_SENTINEL) {
425                    break;
426                }
427                if (u_isdigit(digitC) == FALSE) {
428                    break;
429                }
430                UTEXT_NEXT32(replacement);
431                groupNum=groupNum*10 + u_charDigitValue(digitC);
432                numDigits++;
433                if (numDigits >= fPattern->fMaxCaptureDigits) {
434                    break;
435                }
436            }
437
438
439            if (numDigits == 0) {
440                // The $ didn't introduce a group number at all.
441                // Treat it as just part of the substitution text.
442                UChar c16 = DOLLARSIGN;
443                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
444            } else {
445                // Finally, append the capture group data to the destination.
446                destLen += appendGroup(groupNum, dest, status);
447                if (U_FAILURE(status)) {
448                    // Can fail if group number is out of range.
449                    break;
450                }
451            }
452        }
453
454        if (U_FAILURE(status)) {
455            break;
456        } else {
457            c = UTEXT_NEXT32(replacement);
458        }
459    }
460
461    return *this;
462}
463
464
465
466//--------------------------------------------------------------------------------
467//
468//    appendTail     Intended to be used in conjunction with appendReplacement()
469//                   To the destination string, append everything following
470//                   the last match position from the input string.
471//
472//                   Note:  Match ranges do not affect appendTail or appendReplacement
473//
474//--------------------------------------------------------------------------------
475UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
476    UErrorCode status = U_ZERO_ERROR;
477    UText resultText = UTEXT_INITIALIZER;
478    utext_openUnicodeString(&resultText, &dest, &status);
479
480    if (U_SUCCESS(status)) {
481        appendTail(&resultText, status);
482        utext_close(&resultText);
483    }
484
485    return dest;
486}
487
488//
489//   appendTail, UText mode
490//
491UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
492    UBool bailOut = FALSE;
493    if (U_FAILURE(status)) {
494        bailOut = TRUE;
495    }
496    if (U_FAILURE(fDeferredStatus)) {
497        status = fDeferredStatus;
498        bailOut = TRUE;
499    }
500
501    if (bailOut) {
502        //  dest must not be NULL
503        if (dest) {
504            utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
505            return dest;
506        }
507    }
508
509    if (fInputLength > fAppendPosition) {
510        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
511            int64_t destLen = utext_nativeLength(dest);
512            utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
513                          (int32_t)(fInputLength-fAppendPosition), &status);
514        } else {
515            int32_t len16;
516            if (UTEXT_USES_U16(fInputText)) {
517                len16 = (int32_t)(fInputLength-fAppendPosition);
518            } else {
519                len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
520                status = U_ZERO_ERROR; // buffer overflow
521            }
522
523            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
524            if (inputChars == NULL) {
525                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
526            } else {
527                utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
528                int64_t destLen = utext_nativeLength(dest);
529                utext_replace(dest, destLen, destLen, inputChars, len16, &status);
530                uprv_free(inputChars);
531            }
532        }
533    }
534    return dest;
535}
536
537
538
539//--------------------------------------------------------------------------------
540//
541//   end
542//
543//--------------------------------------------------------------------------------
544int32_t RegexMatcher::end(UErrorCode &err) const {
545    return end(0, err);
546}
547
548int64_t RegexMatcher::end64(UErrorCode &err) const {
549    return end64(0, err);
550}
551
552int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
553    if (U_FAILURE(err)) {
554        return -1;
555    }
556    if (fMatch == FALSE) {
557        err = U_REGEX_INVALID_STATE;
558        return -1;
559    }
560    if (group < 0 || group > fPattern->fGroupMap->size()) {
561        err = U_INDEX_OUTOFBOUNDS_ERROR;
562        return -1;
563    }
564    int64_t e = -1;
565    if (group == 0) {
566        e = fMatchEnd;
567    } else {
568        // Get the position within the stack frame of the variables for
569        //    this capture group.
570        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
571        U_ASSERT(groupOffset < fPattern->fFrameSize);
572        U_ASSERT(groupOffset >= 0);
573        e = fFrame->fExtra[groupOffset + 1];
574    }
575
576        return e;
577}
578
579int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
580    return (int32_t)end64(group, err);
581}
582
583
584//--------------------------------------------------------------------------------
585//
586//   find()
587//
588//--------------------------------------------------------------------------------
589UBool RegexMatcher::find() {
590    // Start at the position of the last match end.  (Will be zero if the
591    //   matcher has been reset.)
592    //
593    if (U_FAILURE(fDeferredStatus)) {
594        return FALSE;
595    }
596
597    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
598        return findUsingChunk();
599    }
600
601    int64_t startPos = fMatchEnd;
602    if (startPos==0) {
603        startPos = fActiveStart;
604    }
605
606    if (fMatch) {
607        // Save the position of any previous successful match.
608        fLastMatchEnd = fMatchEnd;
609
610        if (fMatchStart == fMatchEnd) {
611            // Previous match had zero length.  Move start position up one position
612            //  to avoid sending find() into a loop on zero-length matches.
613            if (startPos >= fActiveLimit) {
614                fMatch = FALSE;
615                fHitEnd = TRUE;
616                return FALSE;
617            }
618            UTEXT_SETNATIVEINDEX(fInputText, startPos);
619            UTEXT_NEXT32(fInputText);
620            startPos = UTEXT_GETNATIVEINDEX(fInputText);
621        }
622    } else {
623        if (fLastMatchEnd >= 0) {
624            // A previous find() failed to match.  Don't try again.
625            //   (without this test, a pattern with a zero-length match
626            //    could match again at the end of an input string.)
627            fHitEnd = TRUE;
628            return FALSE;
629        }
630    }
631
632
633    // Compute the position in the input string beyond which a match can not begin, because
634    //   the minimum length match would extend past the end of the input.
635    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
636    //          Be aware of possible overflows if making changes here.
637    int64_t testStartLimit;
638    if (UTEXT_USES_U16(fInputText)) {
639        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
640        if (startPos > testStartLimit) {
641            fMatch = FALSE;
642            fHitEnd = TRUE;
643            return FALSE;
644        }
645    } else {
646        // For now, let the matcher discover that it can't match on its own
647        // We don't know how long the match len is in native characters
648        testStartLimit = fActiveLimit;
649    }
650
651    UChar32  c;
652    U_ASSERT(startPos >= 0);
653
654    switch (fPattern->fStartType) {
655    case START_NO_INFO:
656        // No optimization was found.
657        //  Try a match at each input position.
658        for (;;) {
659            MatchAt(startPos, FALSE, fDeferredStatus);
660            if (U_FAILURE(fDeferredStatus)) {
661                return FALSE;
662            }
663            if (fMatch) {
664                return TRUE;
665            }
666            if (startPos >= testStartLimit) {
667                fHitEnd = TRUE;
668                return FALSE;
669            }
670            UTEXT_SETNATIVEINDEX(fInputText, startPos);
671            UTEXT_NEXT32(fInputText);
672            startPos = UTEXT_GETNATIVEINDEX(fInputText);
673            // Note that it's perfectly OK for a pattern to have a zero-length
674            //   match at the end of a string, so we must make sure that the loop
675            //   runs with startPos == testStartLimit the last time through.
676            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
677                return FALSE;
678        }
679        U_ASSERT(FALSE);
680
681    case START_START:
682        // Matches are only possible at the start of the input string
683        //   (pattern begins with ^ or \A)
684        if (startPos > fActiveStart) {
685            fMatch = FALSE;
686            return FALSE;
687        }
688        MatchAt(startPos, FALSE, fDeferredStatus);
689        if (U_FAILURE(fDeferredStatus)) {
690            return FALSE;
691        }
692        return fMatch;
693
694
695    case START_SET:
696        {
697            // Match may start on any char from a pre-computed set.
698            U_ASSERT(fPattern->fMinMatchLen > 0);
699            int64_t pos;
700            UTEXT_SETNATIVEINDEX(fInputText, startPos);
701            for (;;) {
702                c = UTEXT_NEXT32(fInputText);
703                pos = UTEXT_GETNATIVEINDEX(fInputText);
704                // c will be -1 (U_SENTINEL) at end of text, in which case we
705                // skip this next block (so we don't have a negative array index)
706                // and handle end of text in the following block.
707                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
708                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
709                    MatchAt(startPos, FALSE, fDeferredStatus);
710                    if (U_FAILURE(fDeferredStatus)) {
711                        return FALSE;
712                    }
713                    if (fMatch) {
714                        return TRUE;
715                    }
716                    UTEXT_SETNATIVEINDEX(fInputText, pos);
717                }
718                if (startPos >= testStartLimit) {
719                    fMatch = FALSE;
720                    fHitEnd = TRUE;
721                    return FALSE;
722                }
723                startPos = pos;
724	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
725                    return FALSE;
726            }
727        }
728        U_ASSERT(FALSE);
729
730    case START_STRING:
731    case START_CHAR:
732        {
733            // Match starts on exactly one char.
734            U_ASSERT(fPattern->fMinMatchLen > 0);
735            UChar32 theChar = fPattern->fInitialChar;
736            int64_t pos;
737            UTEXT_SETNATIVEINDEX(fInputText, startPos);
738            for (;;) {
739                c = UTEXT_NEXT32(fInputText);
740                pos = UTEXT_GETNATIVEINDEX(fInputText);
741                if (c == theChar) {
742                    MatchAt(startPos, FALSE, fDeferredStatus);
743                    if (U_FAILURE(fDeferredStatus)) {
744                        return FALSE;
745                    }
746                    if (fMatch) {
747                        return TRUE;
748                    }
749                    UTEXT_SETNATIVEINDEX(fInputText, pos);
750                }
751                if (startPos >= testStartLimit) {
752                    fMatch = FALSE;
753                    fHitEnd = TRUE;
754                    return FALSE;
755                }
756                startPos = pos;
757	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
758                    return FALSE;
759           }
760        }
761        U_ASSERT(FALSE);
762
763    case START_LINE:
764        {
765            UChar32  c;
766            if (startPos == fAnchorStart) {
767                MatchAt(startPos, FALSE, fDeferredStatus);
768                if (U_FAILURE(fDeferredStatus)) {
769                    return FALSE;
770                }
771                if (fMatch) {
772                    return TRUE;
773                }
774                UTEXT_SETNATIVEINDEX(fInputText, startPos);
775                c = UTEXT_NEXT32(fInputText);
776                startPos = UTEXT_GETNATIVEINDEX(fInputText);
777            } else {
778                UTEXT_SETNATIVEINDEX(fInputText, startPos);
779                c = UTEXT_PREVIOUS32(fInputText);
780                UTEXT_SETNATIVEINDEX(fInputText, startPos);
781            }
782
783            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
784                for (;;) {
785                    if (c == 0x0a) {
786                            MatchAt(startPos, FALSE, fDeferredStatus);
787                            if (U_FAILURE(fDeferredStatus)) {
788                                return FALSE;
789                            }
790                            if (fMatch) {
791                                return TRUE;
792                            }
793                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
794                    }
795                    if (startPos >= testStartLimit) {
796                        fMatch = FALSE;
797                        fHitEnd = TRUE;
798                        return FALSE;
799                    }
800                    c = UTEXT_NEXT32(fInputText);
801                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
802                    // Note that it's perfectly OK for a pattern to have a zero-length
803                    //   match at the end of a string, so we must make sure that the loop
804                    //   runs with startPos == testStartLimit the last time through.
805		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
806                        return FALSE;
807                }
808            } else {
809                for (;;) {
810                    if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
811                        ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
812                            if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
813                                UTEXT_NEXT32(fInputText);
814                                startPos = UTEXT_GETNATIVEINDEX(fInputText);
815                            }
816                            MatchAt(startPos, FALSE, fDeferredStatus);
817                            if (U_FAILURE(fDeferredStatus)) {
818                                return FALSE;
819                            }
820                            if (fMatch) {
821                                return TRUE;
822                            }
823                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
824                    }
825                    if (startPos >= testStartLimit) {
826                        fMatch = FALSE;
827                        fHitEnd = TRUE;
828                        return FALSE;
829                    }
830                    c = UTEXT_NEXT32(fInputText);
831                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
832                    // Note that it's perfectly OK for a pattern to have a zero-length
833                    //   match at the end of a string, so we must make sure that the loop
834                    //   runs with startPos == testStartLimit the last time through.
835		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
836                        return FALSE;
837                }
838            }
839        }
840
841    default:
842        U_ASSERT(FALSE);
843    }
844
845    U_ASSERT(FALSE);
846    return FALSE;
847}
848
849
850
851UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
852    if (U_FAILURE(status)) {
853        return FALSE;
854    }
855    if (U_FAILURE(fDeferredStatus)) {
856        status = fDeferredStatus;
857        return FALSE;
858    }
859    this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
860                                          //        This will reset the region to be the full input length.
861    if (start < 0) {
862        status = U_INDEX_OUTOFBOUNDS_ERROR;
863        return FALSE;
864    }
865
866    int64_t nativeStart = start;
867    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
868        status = U_INDEX_OUTOFBOUNDS_ERROR;
869        return FALSE;
870    }
871    fMatchEnd = nativeStart;
872    return find();
873}
874
875
876//--------------------------------------------------------------------------------
877//
878//   findUsingChunk() -- like find(), but with the advance knowledge that the
879//                       entire string is available in the UText's chunk buffer.
880//
881//--------------------------------------------------------------------------------
882UBool RegexMatcher::findUsingChunk() {
883    // Start at the position of the last match end.  (Will be zero if the
884    //   matcher has been reset.
885    //
886
887    int32_t startPos = (int32_t)fMatchEnd;
888    if (startPos==0) {
889        startPos = (int32_t)fActiveStart;
890    }
891
892    const UChar *inputBuf = fInputText->chunkContents;
893
894    if (fMatch) {
895        // Save the position of any previous successful match.
896        fLastMatchEnd = fMatchEnd;
897
898        if (fMatchStart == fMatchEnd) {
899            // Previous match had zero length.  Move start position up one position
900            //  to avoid sending find() into a loop on zero-length matches.
901            if (startPos >= fActiveLimit) {
902                fMatch = FALSE;
903                fHitEnd = TRUE;
904                return FALSE;
905            }
906            U16_FWD_1(inputBuf, startPos, fInputLength);
907        }
908    } else {
909        if (fLastMatchEnd >= 0) {
910            // A previous find() failed to match.  Don't try again.
911            //   (without this test, a pattern with a zero-length match
912            //    could match again at the end of an input string.)
913            fHitEnd = TRUE;
914            return FALSE;
915        }
916    }
917
918
919    // Compute the position in the input string beyond which a match can not begin, because
920    //   the minimum length match would extend past the end of the input.
921    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
922    //          Be aware of possible overflows if making changes here.
923    int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
924    if (startPos > testLen) {
925        fMatch = FALSE;
926        fHitEnd = TRUE;
927        return FALSE;
928    }
929
930    UChar32  c;
931    U_ASSERT(startPos >= 0);
932
933    switch (fPattern->fStartType) {
934    case START_NO_INFO:
935        // No optimization was found.
936        //  Try a match at each input position.
937        for (;;) {
938            MatchChunkAt(startPos, FALSE, fDeferredStatus);
939            if (U_FAILURE(fDeferredStatus)) {
940                return FALSE;
941            }
942            if (fMatch) {
943                return TRUE;
944            }
945            if (startPos >= testLen) {
946                fHitEnd = TRUE;
947                return FALSE;
948            }
949            U16_FWD_1(inputBuf, startPos, fActiveLimit);
950            // Note that it's perfectly OK for a pattern to have a zero-length
951            //   match at the end of a string, so we must make sure that the loop
952            //   runs with startPos == testLen the last time through.
953            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
954                return FALSE;
955        }
956        U_ASSERT(FALSE);
957
958    case START_START:
959        // Matches are only possible at the start of the input string
960        //   (pattern begins with ^ or \A)
961        if (startPos > fActiveStart) {
962            fMatch = FALSE;
963            return FALSE;
964        }
965        MatchChunkAt(startPos, FALSE, fDeferredStatus);
966        if (U_FAILURE(fDeferredStatus)) {
967            return FALSE;
968        }
969        return fMatch;
970
971
972    case START_SET:
973    {
974        // Match may start on any char from a pre-computed set.
975        U_ASSERT(fPattern->fMinMatchLen > 0);
976        for (;;) {
977            int32_t pos = startPos;
978            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
979            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
980                (c>=256 && fPattern->fInitialChars->contains(c))) {
981                MatchChunkAt(pos, FALSE, fDeferredStatus);
982                if (U_FAILURE(fDeferredStatus)) {
983                    return FALSE;
984                }
985                if (fMatch) {
986                    return TRUE;
987                }
988            }
989            if (pos >= testLen) {
990                fMatch = FALSE;
991                fHitEnd = TRUE;
992                return FALSE;
993            }
994            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
995                return FALSE;
996        }
997    }
998        U_ASSERT(FALSE);
999
1000    case START_STRING:
1001    case START_CHAR:
1002    {
1003        // Match starts on exactly one char.
1004        U_ASSERT(fPattern->fMinMatchLen > 0);
1005        UChar32 theChar = fPattern->fInitialChar;
1006        for (;;) {
1007            int32_t pos = startPos;
1008            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1009            if (c == theChar) {
1010                MatchChunkAt(pos, FALSE, fDeferredStatus);
1011                if (U_FAILURE(fDeferredStatus)) {
1012                    return FALSE;
1013                }
1014                if (fMatch) {
1015                    return TRUE;
1016                }
1017            }
1018            if (pos >= testLen) {
1019                fMatch = FALSE;
1020                fHitEnd = TRUE;
1021                return FALSE;
1022            }
1023            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1024                return FALSE;
1025        }
1026    }
1027        U_ASSERT(FALSE);
1028
1029    case START_LINE:
1030    {
1031        UChar32  c;
1032        if (startPos == fAnchorStart) {
1033            MatchChunkAt(startPos, FALSE, fDeferredStatus);
1034            if (U_FAILURE(fDeferredStatus)) {
1035                return FALSE;
1036            }
1037            if (fMatch) {
1038                return TRUE;
1039            }
1040            U16_FWD_1(inputBuf, startPos, fActiveLimit);
1041        }
1042
1043        if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1044            for (;;) {
1045                c = inputBuf[startPos-1];
1046                if (c == 0x0a) {
1047                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
1048                    if (U_FAILURE(fDeferredStatus)) {
1049                        return FALSE;
1050                    }
1051                    if (fMatch) {
1052                        return TRUE;
1053                    }
1054                }
1055                if (startPos >= testLen) {
1056                    fMatch = FALSE;
1057                    fHitEnd = TRUE;
1058                    return FALSE;
1059                }
1060                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1061                // Note that it's perfectly OK for a pattern to have a zero-length
1062                //   match at the end of a string, so we must make sure that the loop
1063                //   runs with startPos == testLen the last time through.
1064	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1065                    return FALSE;
1066            }
1067        } else {
1068            for (;;) {
1069                c = inputBuf[startPos-1];
1070                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
1071                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1072                    if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1073                        startPos++;
1074                    }
1075                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
1076                    if (U_FAILURE(fDeferredStatus)) {
1077                        return FALSE;
1078                    }
1079                    if (fMatch) {
1080                        return TRUE;
1081                    }
1082                }
1083                if (startPos >= testLen) {
1084                    fMatch = FALSE;
1085                    fHitEnd = TRUE;
1086                    return FALSE;
1087                }
1088                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089                // Note that it's perfectly OK for a pattern to have a zero-length
1090                //   match at the end of a string, so we must make sure that the loop
1091                //   runs with startPos == testLen the last time through.
1092	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1093                    return FALSE;
1094            }
1095        }
1096    }
1097
1098    default:
1099        U_ASSERT(FALSE);
1100    }
1101
1102    U_ASSERT(FALSE);
1103    return FALSE;
1104}
1105
1106
1107
1108//--------------------------------------------------------------------------------
1109//
1110//  group()
1111//
1112//--------------------------------------------------------------------------------
1113UnicodeString RegexMatcher::group(UErrorCode &status) const {
1114    return group(0, status);
1115}
1116
1117//  Return immutable shallow clone
1118UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1119    return group(0, dest, group_len, status);
1120}
1121
1122//  Return immutable shallow clone
1123UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1124    group_len = 0;
1125    UBool bailOut = FALSE;
1126    if (U_FAILURE(status)) {
1127        return dest;
1128    }
1129    if (U_FAILURE(fDeferredStatus)) {
1130        status = fDeferredStatus;
1131        bailOut = TRUE;
1132    }
1133    if (fMatch == FALSE) {
1134        status = U_REGEX_INVALID_STATE;
1135        bailOut = TRUE;
1136    }
1137    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1138        status = U_INDEX_OUTOFBOUNDS_ERROR;
1139        bailOut = TRUE;
1140    }
1141
1142    if (bailOut) {
1143        return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1144    }
1145
1146    int64_t s, e;
1147    if (groupNum == 0) {
1148        s = fMatchStart;
1149        e = fMatchEnd;
1150    } else {
1151        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1152        U_ASSERT(groupOffset < fPattern->fFrameSize);
1153        U_ASSERT(groupOffset >= 0);
1154        s = fFrame->fExtra[groupOffset];
1155        e = fFrame->fExtra[groupOffset+1];
1156    }
1157
1158    if (s < 0) {
1159        // A capture group wasn't part of the match
1160        return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1161    }
1162    U_ASSERT(s <= e);
1163    group_len = e - s;
1164
1165    dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1166    if (dest)
1167        UTEXT_SETNATIVEINDEX(dest, s);
1168    return dest;
1169}
1170
1171UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1172    UnicodeString result;
1173    if (U_FAILURE(status)) {
1174        return result;
1175    }
1176    UText resultText = UTEXT_INITIALIZER;
1177    utext_openUnicodeString(&resultText, &result, &status);
1178    group(groupNum, &resultText, status);
1179    utext_close(&resultText);
1180    return result;
1181}
1182
1183
1184//  Return deep (mutable) clone
1185//		Technology Preview (as an API), but note that the UnicodeString API is implemented
1186//		using this function.
1187UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1188    UBool bailOut = FALSE;
1189    if (U_FAILURE(status)) {
1190        return dest;
1191    }
1192    if (U_FAILURE(fDeferredStatus)) {
1193        status = fDeferredStatus;
1194        bailOut = TRUE;
1195    }
1196
1197    if (fMatch == FALSE) {
1198        status = U_REGEX_INVALID_STATE;
1199        bailOut = TRUE;
1200    }
1201    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202        status = U_INDEX_OUTOFBOUNDS_ERROR;
1203        bailOut = TRUE;
1204    }
1205
1206    if (bailOut) {
1207        if (dest) {
1208            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1209            return dest;
1210        } else {
1211            return utext_openUChars(NULL, NULL, 0, &status);
1212        }
1213    }
1214
1215    int64_t s, e;
1216    if (groupNum == 0) {
1217        s = fMatchStart;
1218        e = fMatchEnd;
1219    } else {
1220        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1221        U_ASSERT(groupOffset < fPattern->fFrameSize);
1222        U_ASSERT(groupOffset >= 0);
1223        s = fFrame->fExtra[groupOffset];
1224        e = fFrame->fExtra[groupOffset+1];
1225    }
1226
1227    if (s < 0) {
1228        // A capture group wasn't part of the match
1229        if (dest) {
1230            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1231            return dest;
1232        } else {
1233            return utext_openUChars(NULL, NULL, 0, &status);
1234        }
1235    }
1236    U_ASSERT(s <= e);
1237
1238    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1239        U_ASSERT(e <= fInputLength);
1240        if (dest) {
1241            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1242        } else {
1243            UText groupText = UTEXT_INITIALIZER;
1244            utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1245            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1246            utext_close(&groupText);
1247        }
1248    } else {
1249        int32_t len16;
1250        if (UTEXT_USES_U16(fInputText)) {
1251            len16 = (int32_t)(e-s);
1252        } else {
1253            UErrorCode lengthStatus = U_ZERO_ERROR;
1254            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1255        }
1256        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1257        if (groupChars == NULL) {
1258            status = U_MEMORY_ALLOCATION_ERROR;
1259            return dest;
1260        }
1261        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1262
1263        if (dest) {
1264            utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1265        } else {
1266            UText groupText = UTEXT_INITIALIZER;
1267            utext_openUChars(&groupText, groupChars, len16, &status);
1268            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1269            utext_close(&groupText);
1270        }
1271
1272        uprv_free(groupChars);
1273    }
1274    return dest;
1275}
1276
1277//--------------------------------------------------------------------------------
1278//
1279//  appendGroup() -- currently internal only, appends a group to a UText rather
1280//                   than replacing its contents
1281//
1282//--------------------------------------------------------------------------------
1283
1284int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1285    if (U_FAILURE(status)) {
1286        return 0;
1287    }
1288    if (U_FAILURE(fDeferredStatus)) {
1289        status = fDeferredStatus;
1290        return 0;
1291    }
1292    int64_t destLen = utext_nativeLength(dest);
1293
1294    if (fMatch == FALSE) {
1295        status = U_REGEX_INVALID_STATE;
1296        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297    }
1298    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1299        status = U_INDEX_OUTOFBOUNDS_ERROR;
1300        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1301    }
1302
1303    int64_t s, e;
1304    if (groupNum == 0) {
1305        s = fMatchStart;
1306        e = fMatchEnd;
1307    } else {
1308        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1309        U_ASSERT(groupOffset < fPattern->fFrameSize);
1310        U_ASSERT(groupOffset >= 0);
1311        s = fFrame->fExtra[groupOffset];
1312        e = fFrame->fExtra[groupOffset+1];
1313    }
1314
1315    if (s < 0) {
1316        // A capture group wasn't part of the match
1317        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1318    }
1319    U_ASSERT(s <= e);
1320
1321    int64_t deltaLen;
1322    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1323        U_ASSERT(e <= fInputLength);
1324        deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1325    } else {
1326        int32_t len16;
1327        if (UTEXT_USES_U16(fInputText)) {
1328            len16 = (int32_t)(e-s);
1329        } else {
1330            UErrorCode lengthStatus = U_ZERO_ERROR;
1331            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1332        }
1333        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1334        if (groupChars == NULL) {
1335            status = U_MEMORY_ALLOCATION_ERROR;
1336            return 0;
1337        }
1338        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1339
1340        deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1341        uprv_free(groupChars);
1342    }
1343    return deltaLen;
1344}
1345
1346
1347
1348//--------------------------------------------------------------------------------
1349//
1350//  groupCount()
1351//
1352//--------------------------------------------------------------------------------
1353int32_t RegexMatcher::groupCount() const {
1354    return fPattern->fGroupMap->size();
1355}
1356
1357
1358
1359//--------------------------------------------------------------------------------
1360//
1361//  hasAnchoringBounds()
1362//
1363//--------------------------------------------------------------------------------
1364UBool RegexMatcher::hasAnchoringBounds() const {
1365    return fAnchoringBounds;
1366}
1367
1368
1369//--------------------------------------------------------------------------------
1370//
1371//  hasTransparentBounds()
1372//
1373//--------------------------------------------------------------------------------
1374UBool RegexMatcher::hasTransparentBounds() const {
1375    return fTransparentBounds;
1376}
1377
1378
1379
1380//--------------------------------------------------------------------------------
1381//
1382//  hitEnd()
1383//
1384//--------------------------------------------------------------------------------
1385UBool RegexMatcher::hitEnd() const {
1386    return fHitEnd;
1387}
1388
1389
1390//--------------------------------------------------------------------------------
1391//
1392//  input()
1393//
1394//--------------------------------------------------------------------------------
1395const UnicodeString &RegexMatcher::input() const {
1396    if (!fInput) {
1397        UErrorCode status = U_ZERO_ERROR;
1398        int32_t len16;
1399        if (UTEXT_USES_U16(fInputText)) {
1400            len16 = (int32_t)fInputLength;
1401        } else {
1402            len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1403            status = U_ZERO_ERROR; // overflow, length status
1404        }
1405        UnicodeString *result = new UnicodeString(len16, 0, 0);
1406
1407        UChar *inputChars = result->getBuffer(len16);
1408        utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1409        result->releaseBuffer(len16);
1410
1411        (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1412    }
1413
1414    return *fInput;
1415}
1416
1417//--------------------------------------------------------------------------------
1418//
1419//  inputText()
1420//
1421//--------------------------------------------------------------------------------
1422UText *RegexMatcher::inputText() const {
1423    return fInputText;
1424}
1425
1426
1427//--------------------------------------------------------------------------------
1428//
1429//  getInput() -- like inputText(), but makes a clone or copies into another UText
1430//
1431//--------------------------------------------------------------------------------
1432UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1433    UBool bailOut = FALSE;
1434    if (U_FAILURE(status)) {
1435        return dest;
1436    }
1437    if (U_FAILURE(fDeferredStatus)) {
1438        status = fDeferredStatus;
1439        bailOut = TRUE;
1440    }
1441
1442    if (bailOut) {
1443        if (dest) {
1444            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1445            return dest;
1446        } else {
1447            return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1448        }
1449    }
1450
1451    if (dest) {
1452        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1453            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1454        } else {
1455            int32_t input16Len;
1456            if (UTEXT_USES_U16(fInputText)) {
1457                input16Len = (int32_t)fInputLength;
1458            } else {
1459                UErrorCode lengthStatus = U_ZERO_ERROR;
1460                input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1461            }
1462            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1463            if (inputChars == NULL) {
1464                return dest;
1465            }
1466
1467            status = U_ZERO_ERROR;
1468            utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1469            status = U_ZERO_ERROR;
1470            utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1471
1472            uprv_free(inputChars);
1473        }
1474        return dest;
1475    } else {
1476        return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1477    }
1478}
1479
1480
1481static UBool compat_SyncMutableUTextContents(UText *ut);
1482static UBool compat_SyncMutableUTextContents(UText *ut) {
1483    UBool retVal = FALSE;
1484
1485    //  In the following test, we're really only interested in whether the UText should switch
1486    //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1487    //  will still point to the correct data.
1488    if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1489        UnicodeString *us=(UnicodeString *)ut->context;
1490
1491        // Update to the latest length.
1492        // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1493        int32_t newLength = us->length();
1494
1495        // Update the chunk description.
1496        // The buffer may have switched between stack- and heap-based.
1497        ut->chunkContents    = us->getBuffer();
1498        ut->chunkLength      = newLength;
1499        ut->chunkNativeLimit = newLength;
1500        ut->nativeIndexingLimit = newLength;
1501        retVal = TRUE;
1502    }
1503
1504    return retVal;
1505}
1506
1507//--------------------------------------------------------------------------------
1508//
1509//  lookingAt()
1510//
1511//--------------------------------------------------------------------------------
1512UBool RegexMatcher::lookingAt(UErrorCode &status) {
1513    if (U_FAILURE(status)) {
1514        return FALSE;
1515    }
1516    if (U_FAILURE(fDeferredStatus)) {
1517        status = fDeferredStatus;
1518        return FALSE;
1519    }
1520
1521    if (fInputUniStrMaybeMutable) {
1522        if (compat_SyncMutableUTextContents(fInputText)) {
1523        fInputLength = utext_nativeLength(fInputText);
1524        reset();
1525        }
1526    }
1527    else {
1528        resetPreserveRegion();
1529    }
1530    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1531        MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1532    } else {
1533        MatchAt(fActiveStart, FALSE, status);
1534    }
1535    return fMatch;
1536}
1537
1538
1539UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1540    if (U_FAILURE(status)) {
1541        return FALSE;
1542    }
1543    if (U_FAILURE(fDeferredStatus)) {
1544        status = fDeferredStatus;
1545        return FALSE;
1546    }
1547    reset();
1548
1549    if (start < 0) {
1550        status = U_INDEX_OUTOFBOUNDS_ERROR;
1551        return FALSE;
1552    }
1553
1554    if (fInputUniStrMaybeMutable) {
1555        if (compat_SyncMutableUTextContents(fInputText)) {
1556        fInputLength = utext_nativeLength(fInputText);
1557        reset();
1558        }
1559    }
1560
1561    int64_t nativeStart;
1562    nativeStart = start;
1563    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1564        status = U_INDEX_OUTOFBOUNDS_ERROR;
1565        return FALSE;
1566    }
1567
1568    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1569        MatchChunkAt((int32_t)nativeStart, FALSE, status);
1570    } else {
1571        MatchAt(nativeStart, FALSE, status);
1572    }
1573    return fMatch;
1574}
1575
1576
1577
1578//--------------------------------------------------------------------------------
1579//
1580//  matches()
1581//
1582//--------------------------------------------------------------------------------
1583UBool RegexMatcher::matches(UErrorCode &status) {
1584    if (U_FAILURE(status)) {
1585        return FALSE;
1586    }
1587    if (U_FAILURE(fDeferredStatus)) {
1588        status = fDeferredStatus;
1589        return FALSE;
1590    }
1591
1592    if (fInputUniStrMaybeMutable) {
1593        if (compat_SyncMutableUTextContents(fInputText)) {
1594        fInputLength = utext_nativeLength(fInputText);
1595        reset();
1596        }
1597    }
1598    else {
1599        resetPreserveRegion();
1600    }
1601
1602    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1603        MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1604    } else {
1605        MatchAt(fActiveStart, TRUE, status);
1606    }
1607    return fMatch;
1608}
1609
1610
1611UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1612    if (U_FAILURE(status)) {
1613        return FALSE;
1614    }
1615    if (U_FAILURE(fDeferredStatus)) {
1616        status = fDeferredStatus;
1617        return FALSE;
1618    }
1619    reset();
1620
1621    if (start < 0) {
1622        status = U_INDEX_OUTOFBOUNDS_ERROR;
1623        return FALSE;
1624    }
1625
1626    if (fInputUniStrMaybeMutable) {
1627        if (compat_SyncMutableUTextContents(fInputText)) {
1628        fInputLength = utext_nativeLength(fInputText);
1629        reset();
1630        }
1631    }
1632
1633    int64_t nativeStart;
1634    nativeStart = start;
1635    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1636        status = U_INDEX_OUTOFBOUNDS_ERROR;
1637        return FALSE;
1638    }
1639
1640    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1641        MatchChunkAt((int32_t)nativeStart, TRUE, status);
1642    } else {
1643        MatchAt(nativeStart, TRUE, status);
1644    }
1645    return fMatch;
1646}
1647
1648
1649
1650//--------------------------------------------------------------------------------
1651//
1652//    pattern
1653//
1654//--------------------------------------------------------------------------------
1655const RegexPattern &RegexMatcher::pattern() const {
1656    return *fPattern;
1657}
1658
1659
1660
1661//--------------------------------------------------------------------------------
1662//
1663//    region
1664//
1665//--------------------------------------------------------------------------------
1666RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1667    if (U_FAILURE(status)) {
1668        return *this;
1669    }
1670
1671    if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1672        status = U_ILLEGAL_ARGUMENT_ERROR;
1673    }
1674
1675    int64_t nativeStart = regionStart;
1676    int64_t nativeLimit = regionLimit;
1677    if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1678      status = U_ILLEGAL_ARGUMENT_ERROR;
1679    }
1680
1681    if (startIndex == -1)
1682      this->reset();
1683    else
1684      resetPreserveRegion();
1685
1686    fRegionStart = nativeStart;
1687    fRegionLimit = nativeLimit;
1688    fActiveStart = nativeStart;
1689    fActiveLimit = nativeLimit;
1690
1691    if (startIndex != -1) {
1692      if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1693          status = U_INDEX_OUTOFBOUNDS_ERROR;
1694      }
1695      fMatchEnd = startIndex;
1696    }
1697
1698    if (!fTransparentBounds) {
1699        fLookStart = nativeStart;
1700        fLookLimit = nativeLimit;
1701    }
1702    if (fAnchoringBounds) {
1703        fAnchorStart = nativeStart;
1704        fAnchorLimit = nativeLimit;
1705    }
1706    return *this;
1707}
1708
1709RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1710  return region(start, limit, -1, status);
1711}
1712
1713//--------------------------------------------------------------------------------
1714//
1715//    regionEnd
1716//
1717//--------------------------------------------------------------------------------
1718int32_t RegexMatcher::regionEnd() const {
1719    return (int32_t)fRegionLimit;
1720}
1721
1722int64_t RegexMatcher::regionEnd64() const {
1723    return fRegionLimit;
1724}
1725
1726//--------------------------------------------------------------------------------
1727//
1728//    regionStart
1729//
1730//--------------------------------------------------------------------------------
1731int32_t RegexMatcher::regionStart() const {
1732    return (int32_t)fRegionStart;
1733}
1734
1735int64_t RegexMatcher::regionStart64() const {
1736    return fRegionStart;
1737}
1738
1739
1740//--------------------------------------------------------------------------------
1741//
1742//    replaceAll
1743//
1744//--------------------------------------------------------------------------------
1745UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1746    UText replacementText = UTEXT_INITIALIZER;
1747    UText resultText = UTEXT_INITIALIZER;
1748    UnicodeString resultString;
1749    if (U_FAILURE(status)) {
1750        return resultString;
1751    }
1752
1753    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754    utext_openUnicodeString(&resultText, &resultString, &status);
1755
1756    replaceAll(&replacementText, &resultText, status);
1757
1758    utext_close(&resultText);
1759    utext_close(&replacementText);
1760
1761    return resultString;
1762}
1763
1764
1765//
1766//    replaceAll, UText mode
1767//
1768UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1769    if (U_FAILURE(status)) {
1770        return dest;
1771    }
1772    if (U_FAILURE(fDeferredStatus)) {
1773        status = fDeferredStatus;
1774        return dest;
1775    }
1776
1777    if (dest == NULL) {
1778        UnicodeString emptyString;
1779        UText empty = UTEXT_INITIALIZER;
1780
1781        utext_openUnicodeString(&empty, &emptyString, &status);
1782        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783        utext_close(&empty);
1784    }
1785
1786    if (U_SUCCESS(status)) {
1787        reset();
1788        while (find()) {
1789            appendReplacement(dest, replacement, status);
1790            if (U_FAILURE(status)) {
1791                break;
1792            }
1793        }
1794        appendTail(dest, status);
1795    }
1796
1797    return dest;
1798}
1799
1800
1801//--------------------------------------------------------------------------------
1802//
1803//    replaceFirst
1804//
1805//--------------------------------------------------------------------------------
1806UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1807    UText replacementText = UTEXT_INITIALIZER;
1808    UText resultText = UTEXT_INITIALIZER;
1809    UnicodeString resultString;
1810
1811    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1812    utext_openUnicodeString(&resultText, &resultString, &status);
1813
1814    replaceFirst(&replacementText, &resultText, status);
1815
1816    utext_close(&resultText);
1817    utext_close(&replacementText);
1818
1819    return resultString;
1820}
1821
1822//
1823//    replaceFirst, UText mode
1824//
1825UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1826    if (U_FAILURE(status)) {
1827        return dest;
1828    }
1829    if (U_FAILURE(fDeferredStatus)) {
1830        status = fDeferredStatus;
1831        return dest;
1832    }
1833
1834    reset();
1835    if (!find()) {
1836        return getInput(dest, status);
1837    }
1838
1839    if (dest == NULL) {
1840        UnicodeString emptyString;
1841        UText empty = UTEXT_INITIALIZER;
1842
1843        utext_openUnicodeString(&empty, &emptyString, &status);
1844        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1845        utext_close(&empty);
1846    }
1847
1848    appendReplacement(dest, replacement, status);
1849    appendTail(dest, status);
1850
1851    return dest;
1852}
1853
1854
1855//--------------------------------------------------------------------------------
1856//
1857//     requireEnd
1858//
1859//--------------------------------------------------------------------------------
1860UBool RegexMatcher::requireEnd() const {
1861    return fRequireEnd;
1862}
1863
1864
1865//--------------------------------------------------------------------------------
1866//
1867//     reset
1868//
1869//--------------------------------------------------------------------------------
1870RegexMatcher &RegexMatcher::reset() {
1871    fRegionStart    = 0;
1872    fRegionLimit    = fInputLength;
1873    fActiveStart    = 0;
1874    fActiveLimit    = fInputLength;
1875    fAnchorStart    = 0;
1876    fAnchorLimit    = fInputLength;
1877    fLookStart      = 0;
1878    fLookLimit      = fInputLength;
1879    resetPreserveRegion();
1880    return *this;
1881}
1882
1883
1884
1885void RegexMatcher::resetPreserveRegion() {
1886    fMatchStart     = 0;
1887    fMatchEnd       = 0;
1888    fLastMatchEnd   = -1;
1889    fAppendPosition = 0;
1890    fMatch          = FALSE;
1891    fHitEnd         = FALSE;
1892    fRequireEnd     = FALSE;
1893    fTime           = 0;
1894    fTickCounter    = TIMER_INITIAL_VALUE;
1895    //resetStack(); // more expensive than it looks...
1896}
1897
1898
1899RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1900    fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1901    if (fPattern->fNeedsAltInput) {
1902        fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1903    }
1904    fInputLength = utext_nativeLength(fInputText);
1905
1906    reset();
1907    delete fInput;
1908    fInput = NULL;
1909
1910    //  Do the following for any UnicodeString.
1911    //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1912    fInputUniStrMaybeMutable = TRUE;
1913
1914    if (fWordBreakItr != NULL) {
1915#if UCONFIG_NO_BREAK_ITERATION==0
1916        UErrorCode status = U_ZERO_ERROR;
1917        fWordBreakItr->setText(fInputText, status);
1918#endif
1919    }
1920    return *this;
1921}
1922
1923
1924RegexMatcher &RegexMatcher::reset(UText *input) {
1925    if (fInputText != input) {
1926        fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1927        if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1928        fInputLength = utext_nativeLength(fInputText);
1929
1930        delete fInput;
1931        fInput = NULL;
1932
1933        if (fWordBreakItr != NULL) {
1934#if UCONFIG_NO_BREAK_ITERATION==0
1935            UErrorCode status = U_ZERO_ERROR;
1936            fWordBreakItr->setText(input, status);
1937#endif
1938        }
1939    }
1940    reset();
1941    fInputUniStrMaybeMutable = FALSE;
1942
1943    return *this;
1944}
1945
1946/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1947    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1948    return *this;
1949}*/
1950
1951RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1952    if (U_FAILURE(status)) {
1953        return *this;
1954    }
1955    reset();       // Reset also resets the region to be the entire string.
1956
1957    if (position < 0 || position > fActiveLimit) {
1958        status = U_INDEX_OUTOFBOUNDS_ERROR;
1959        return *this;
1960    }
1961    fMatchEnd = position;
1962    return *this;
1963}
1964
1965
1966
1967
1968
1969//--------------------------------------------------------------------------------
1970//
1971//    setTrace
1972//
1973//--------------------------------------------------------------------------------
1974void RegexMatcher::setTrace(UBool state) {
1975    fTraceDebug = state;
1976}
1977
1978
1979
1980//---------------------------------------------------------------------
1981//
1982//   split
1983//
1984//---------------------------------------------------------------------
1985int32_t  RegexMatcher::split(const UnicodeString &input,
1986        UnicodeString    dest[],
1987        int32_t          destCapacity,
1988        UErrorCode      &status)
1989{
1990    UText inputText = UTEXT_INITIALIZER;
1991    utext_openConstUnicodeString(&inputText, &input, &status);
1992    if (U_FAILURE(status)) {
1993        return 0;
1994    }
1995
1996    UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
1997    if (destText == NULL) {
1998        status = U_MEMORY_ALLOCATION_ERROR;
1999        return 0;
2000    }
2001    int32_t i;
2002    for (i = 0; i < destCapacity; i++) {
2003        destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2004    }
2005
2006    int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2007
2008    for (i = 0; i < destCapacity; i++) {
2009        utext_close(destText[i]);
2010    }
2011
2012    uprv_free(destText);
2013    utext_close(&inputText);
2014    return fieldCount;
2015}
2016
2017//
2018//   split, UText mode
2019//
2020int32_t  RegexMatcher::split(UText *input,
2021        UText           *dest[],
2022        int32_t          destCapacity,
2023        UErrorCode      &status)
2024{
2025    //
2026    // Check arguements for validity
2027    //
2028    if (U_FAILURE(status)) {
2029        return 0;
2030    };
2031
2032    if (destCapacity < 1) {
2033        status = U_ILLEGAL_ARGUMENT_ERROR;
2034        return 0;
2035    }
2036
2037    //
2038    // Reset for the input text
2039    //
2040    reset(input);
2041    int64_t   nextOutputStringStart = 0;
2042    if (fActiveLimit == 0) {
2043        return 0;
2044    }
2045
2046    //
2047    // Loop through the input text, searching for the delimiter pattern
2048    //
2049    int32_t i;
2050    int32_t numCaptureGroups = fPattern->fGroupMap->size();
2051    for (i=0; ; i++) {
2052        if (i>=destCapacity-1) {
2053            // There is one or zero output string left.
2054            // Fill the last output string with whatever is left from the input, then exit the loop.
2055            //  ( i will be == destCapacity if we filled the output array while processing
2056            //    capture groups of the delimiter expression, in which case we will discard the
2057            //    last capture group saved in favor of the unprocessed remainder of the
2058            //    input string.)
2059            i = destCapacity-1;
2060            if (fActiveLimit > nextOutputStringStart) {
2061                if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2062                    if (dest[i]) {
2063                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2064                                      input->chunkContents+nextOutputStringStart,
2065                                      (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2066                    } else {
2067                        UText remainingText = UTEXT_INITIALIZER;
2068                        utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2069                                         fActiveLimit-nextOutputStringStart, &status);
2070                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2071                        utext_close(&remainingText);
2072                    }
2073                } else {
2074                    UErrorCode lengthStatus = U_ZERO_ERROR;
2075                    int32_t remaining16Length =
2076                        utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2077                    UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2078                    if (remainingChars == NULL) {
2079                        status = U_MEMORY_ALLOCATION_ERROR;
2080                        break;
2081                    }
2082
2083                    utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2084                    if (dest[i]) {
2085                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2086                    } else {
2087                        UText remainingText = UTEXT_INITIALIZER;
2088                        utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2089                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2090                        utext_close(&remainingText);
2091                    }
2092
2093                    uprv_free(remainingChars);
2094                }
2095            }
2096            break;
2097        }
2098        if (find()) {
2099            // We found another delimiter.  Move everything from where we started looking
2100            //  up until the start of the delimiter into the next output string.
2101            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2102                if (dest[i]) {
2103                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2104                                  input->chunkContents+nextOutputStringStart,
2105                                  (int32_t)(fMatchStart-nextOutputStringStart), &status);
2106                } else {
2107                    UText remainingText = UTEXT_INITIALIZER;
2108                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2109                                      fMatchStart-nextOutputStringStart, &status);
2110                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2111                    utext_close(&remainingText);
2112                }
2113            } else {
2114                UErrorCode lengthStatus = U_ZERO_ERROR;
2115                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2116                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2117                if (remainingChars == NULL) {
2118                    status = U_MEMORY_ALLOCATION_ERROR;
2119                    break;
2120                }
2121                utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2122                if (dest[i]) {
2123                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2124                } else {
2125                    UText remainingText = UTEXT_INITIALIZER;
2126                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2127                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2128                    utext_close(&remainingText);
2129                }
2130
2131                uprv_free(remainingChars);
2132            }
2133            nextOutputStringStart = fMatchEnd;
2134
2135            // If the delimiter pattern has capturing parentheses, the captured
2136            //  text goes out into the next n destination strings.
2137            int32_t groupNum;
2138            UBool lastGroupWasNullUText = FALSE;
2139            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2140                if (i==destCapacity-1) {
2141                    break;
2142                }
2143                i++;
2144                lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
2145                dest[i] = group(groupNum, dest[i], status);
2146            }
2147
2148            if (nextOutputStringStart == fActiveLimit) {
2149                // The delimiter was at the end of the string.  We're done.
2150                break;
2151            } else if (i == destCapacity-1) {
2152                // We're out of capture groups, and the rest of the string is more important
2153                if (lastGroupWasNullUText) {
2154                    utext_close(dest[i]);
2155                    dest[i] = NULL;
2156                }
2157            }
2158
2159        }
2160        else
2161        {
2162            // We ran off the end of the input while looking for the next delimiter.
2163            // All the remaining text goes into the current output string.
2164            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2165                if (dest[i]) {
2166                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2167                                  input->chunkContents+nextOutputStringStart,
2168                                  (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2169                } else {
2170                    UText remainingText = UTEXT_INITIALIZER;
2171                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2172                                     fActiveLimit-nextOutputStringStart, &status);
2173                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2174                    utext_close(&remainingText);
2175                }
2176            } else {
2177                UErrorCode lengthStatus = U_ZERO_ERROR;
2178                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2179                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2180                if (remainingChars == NULL) {
2181                    status = U_MEMORY_ALLOCATION_ERROR;
2182                    break;
2183                }
2184
2185                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2186                if (dest[i]) {
2187                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2188                } else {
2189                    UText remainingText = UTEXT_INITIALIZER;
2190                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2191                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2192                    utext_close(&remainingText);
2193                }
2194
2195                uprv_free(remainingChars);
2196            }
2197            break;
2198        }
2199        if (U_FAILURE(status)) {
2200            break;
2201        }
2202    }   // end of for loop
2203    return i+1;
2204}
2205
2206
2207//--------------------------------------------------------------------------------
2208//
2209//     start
2210//
2211//--------------------------------------------------------------------------------
2212int32_t RegexMatcher::start(UErrorCode &status) const {
2213    return start(0, status);
2214}
2215
2216int64_t RegexMatcher::start64(UErrorCode &status) const {
2217    return start64(0, status);
2218}
2219
2220//--------------------------------------------------------------------------------
2221//
2222//     start(int32_t group, UErrorCode &status)
2223//
2224//--------------------------------------------------------------------------------
2225
2226int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2227    if (U_FAILURE(status)) {
2228        return -1;
2229    }
2230    if (U_FAILURE(fDeferredStatus)) {
2231        status = fDeferredStatus;
2232        return -1;
2233    }
2234    if (fMatch == FALSE) {
2235        status = U_REGEX_INVALID_STATE;
2236        return -1;
2237    }
2238    if (group < 0 || group > fPattern->fGroupMap->size()) {
2239        status = U_INDEX_OUTOFBOUNDS_ERROR;
2240        return -1;
2241    }
2242    int64_t s;
2243    if (group == 0) {
2244        s = fMatchStart;
2245    } else {
2246        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2247        U_ASSERT(groupOffset < fPattern->fFrameSize);
2248        U_ASSERT(groupOffset >= 0);
2249        s = fFrame->fExtra[groupOffset];
2250    }
2251
2252    return s;
2253}
2254
2255
2256int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2257    return (int32_t)start64(group, status);
2258}
2259
2260//--------------------------------------------------------------------------------
2261//
2262//     useAnchoringBounds
2263//
2264//--------------------------------------------------------------------------------
2265RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2266    fAnchoringBounds = b;
2267    fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2268    fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2269    return *this;
2270}
2271
2272
2273//--------------------------------------------------------------------------------
2274//
2275//     useTransparentBounds
2276//
2277//--------------------------------------------------------------------------------
2278RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2279    fTransparentBounds = b;
2280    fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2281    fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2282    return *this;
2283}
2284
2285//--------------------------------------------------------------------------------
2286//
2287//     setTimeLimit
2288//
2289//--------------------------------------------------------------------------------
2290void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2291    if (U_FAILURE(status)) {
2292        return;
2293    }
2294    if (U_FAILURE(fDeferredStatus)) {
2295        status = fDeferredStatus;
2296        return;
2297    }
2298    if (limit < 0) {
2299        status = U_ILLEGAL_ARGUMENT_ERROR;
2300        return;
2301    }
2302    fTimeLimit = limit;
2303}
2304
2305
2306//--------------------------------------------------------------------------------
2307//
2308//     getTimeLimit
2309//
2310//--------------------------------------------------------------------------------
2311int32_t RegexMatcher::getTimeLimit() const {
2312    return fTimeLimit;
2313}
2314
2315
2316//--------------------------------------------------------------------------------
2317//
2318//     setStackLimit
2319//
2320//--------------------------------------------------------------------------------
2321void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2322    if (U_FAILURE(status)) {
2323        return;
2324    }
2325    if (U_FAILURE(fDeferredStatus)) {
2326        status = fDeferredStatus;
2327        return;
2328    }
2329    if (limit < 0) {
2330        status = U_ILLEGAL_ARGUMENT_ERROR;
2331        return;
2332    }
2333
2334    // Reset the matcher.  This is needed here in case there is a current match
2335    //    whose final stack frame (containing the match results, pointed to by fFrame)
2336    //    would be lost by resizing to a smaller stack size.
2337    reset();
2338
2339    if (limit == 0) {
2340        // Unlimited stack expansion
2341        fStack->setMaxCapacity(0);
2342    } else {
2343        // Change the units of the limit  from bytes to ints, and bump the size up
2344        //   to be big enough to hold at least one stack frame for the pattern,
2345        //   if it isn't there already.
2346        int32_t adjustedLimit = limit / sizeof(int32_t);
2347        if (adjustedLimit < fPattern->fFrameSize) {
2348            adjustedLimit = fPattern->fFrameSize;
2349        }
2350        fStack->setMaxCapacity(adjustedLimit);
2351    }
2352    fStackLimit = limit;
2353}
2354
2355
2356//--------------------------------------------------------------------------------
2357//
2358//     getStackLimit
2359//
2360//--------------------------------------------------------------------------------
2361int32_t RegexMatcher::getStackLimit() const {
2362    return fStackLimit;
2363}
2364
2365
2366//--------------------------------------------------------------------------------
2367//
2368//     setMatchCallback
2369//
2370//--------------------------------------------------------------------------------
2371void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2372                                    const void              *context,
2373                                    UErrorCode              &status) {
2374    if (U_FAILURE(status)) {
2375        return;
2376    }
2377    fCallbackFn = callback;
2378    fCallbackContext = context;
2379}
2380
2381
2382//--------------------------------------------------------------------------------
2383//
2384//     getMatchCallback
2385//
2386//--------------------------------------------------------------------------------
2387void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2388                                  const void              *&context,
2389                                  UErrorCode              &status) {
2390    if (U_FAILURE(status)) {
2391       return;
2392    }
2393    callback = fCallbackFn;
2394    context  = fCallbackContext;
2395}
2396
2397
2398//--------------------------------------------------------------------------------
2399//
2400//     setMatchCallback
2401//
2402//--------------------------------------------------------------------------------
2403void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2404                                                const void                      *context,
2405                                                UErrorCode                      &status) {
2406    if (U_FAILURE(status)) {
2407        return;
2408    }
2409    fFindProgressCallbackFn = callback;
2410    fFindProgressCallbackContext = context;
2411}
2412
2413
2414//--------------------------------------------------------------------------------
2415//
2416//     getMatchCallback
2417//
2418//--------------------------------------------------------------------------------
2419void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2420                                                const void                    *&context,
2421                                                UErrorCode                    &status) {
2422    if (U_FAILURE(status)) {
2423       return;
2424    }
2425    callback = fFindProgressCallbackFn;
2426    context  = fFindProgressCallbackContext;
2427}
2428
2429
2430//================================================================================
2431//
2432//    Code following this point in this file is the internal
2433//    Match Engine Implementation.
2434//
2435//================================================================================
2436
2437
2438//--------------------------------------------------------------------------------
2439//
2440//   resetStack
2441//           Discard any previous contents of the state save stack, and initialize a
2442//           new stack frame to all -1.  The -1s are needed for capture group limits,
2443//           where they indicate that a group has not yet matched anything.
2444//--------------------------------------------------------------------------------
2445REStackFrame *RegexMatcher::resetStack() {
2446    // Discard any previous contents of the state save stack, and initialize a
2447    //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2448    //  where they indicate that a group has not yet matched anything.
2449    fStack->removeAllElements();
2450
2451    REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2452    int32_t i;
2453    for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2454        iFrame->fExtra[i] = -1;
2455    }
2456    return iFrame;
2457}
2458
2459
2460
2461//--------------------------------------------------------------------------------
2462//
2463//   isWordBoundary
2464//                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2465//                     For us,
2466//                       If the current char is a combining mark,
2467//                          \b is FALSE.
2468//                       Else Scan backwards to the first non-combining char.
2469//                            We are at a boundary if the this char and the original chars are
2470//                               opposite in membership in \w set
2471//
2472//          parameters:   pos   - the current position in the input buffer
2473//
2474//              TODO:  double-check edge cases at region boundaries.
2475//
2476//--------------------------------------------------------------------------------
2477UBool RegexMatcher::isWordBoundary(int64_t pos) {
2478    UBool isBoundary = FALSE;
2479    UBool cIsWord    = FALSE;
2480
2481    if (pos >= fLookLimit) {
2482        fHitEnd = TRUE;
2483    } else {
2484        // Determine whether char c at current position is a member of the word set of chars.
2485        // If we're off the end of the string, behave as though we're not at a word char.
2486        UTEXT_SETNATIVEINDEX(fInputText, pos);
2487        UChar32  c = UTEXT_CURRENT32(fInputText);
2488        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2489            // Current char is a combining one.  Not a boundary.
2490            return FALSE;
2491        }
2492        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2493    }
2494
2495    // Back up until we come to a non-combining char, determine whether
2496    //  that char is a word char.
2497    UBool prevCIsWord = FALSE;
2498    for (;;) {
2499        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2500            break;
2501        }
2502        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2503        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2504              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2505            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2506            break;
2507        }
2508    }
2509    isBoundary = cIsWord ^ prevCIsWord;
2510    return isBoundary;
2511}
2512
2513UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2514    UBool isBoundary = FALSE;
2515    UBool cIsWord    = FALSE;
2516
2517    const UChar *inputBuf = fInputText->chunkContents;
2518
2519    if (pos >= fLookLimit) {
2520        fHitEnd = TRUE;
2521    } else {
2522        // Determine whether char c at current position is a member of the word set of chars.
2523        // If we're off the end of the string, behave as though we're not at a word char.
2524        UChar32 c;
2525        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2526        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2527            // Current char is a combining one.  Not a boundary.
2528            return FALSE;
2529        }
2530        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2531    }
2532
2533    // Back up until we come to a non-combining char, determine whether
2534    //  that char is a word char.
2535    UBool prevCIsWord = FALSE;
2536    for (;;) {
2537        if (pos <= fLookStart) {
2538            break;
2539        }
2540        UChar32 prevChar;
2541        U16_PREV(inputBuf, fLookStart, pos, prevChar);
2542        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2543              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2544            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2545            break;
2546        }
2547    }
2548    isBoundary = cIsWord ^ prevCIsWord;
2549    return isBoundary;
2550}
2551
2552//--------------------------------------------------------------------------------
2553//
2554//   isUWordBoundary
2555//
2556//         Test for a word boundary using RBBI word break.
2557//
2558//          parameters:   pos   - the current position in the input buffer
2559//
2560//--------------------------------------------------------------------------------
2561UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2562    UBool       returnVal = FALSE;
2563#if UCONFIG_NO_BREAK_ITERATION==0
2564
2565    // If we haven't yet created a break iterator for this matcher, do it now.
2566    if (fWordBreakItr == NULL) {
2567        fWordBreakItr =
2568            (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2569        if (U_FAILURE(fDeferredStatus)) {
2570            return FALSE;
2571        }
2572        fWordBreakItr->setText(fInputText, fDeferredStatus);
2573    }
2574
2575    if (pos >= fLookLimit) {
2576        fHitEnd = TRUE;
2577        returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2578                            //    words are not boundaries.  All non-word chars stand by themselves,
2579                            //    with word boundaries on both sides.
2580    } else {
2581        if (!UTEXT_USES_U16(fInputText)) {
2582            // !!!: Would like a better way to do this!
2583            UErrorCode status = U_ZERO_ERROR;
2584            pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2585        }
2586        returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2587    }
2588#endif
2589    return   returnVal;
2590}
2591
2592//--------------------------------------------------------------------------------
2593//
2594//   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2595//                     saves. Increment the "time" counter, and call the
2596//                     user callback function if there is one installed.
2597//
2598//                     If the match operation needs to be aborted, either for a time-out
2599//                     or because the user callback asked for it, just set an error status.
2600//                     The engine will pick that up and stop in its outer loop.
2601//
2602//--------------------------------------------------------------------------------
2603void RegexMatcher::IncrementTime(UErrorCode &status) {
2604    fTickCounter = TIMER_INITIAL_VALUE;
2605    fTime++;
2606    if (fCallbackFn != NULL) {
2607        if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2608            status = U_REGEX_STOPPED_BY_CALLER;
2609            return;
2610        }
2611    }
2612    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2613        status = U_REGEX_TIME_OUT;
2614    }
2615}
2616
2617//--------------------------------------------------------------------------------
2618//
2619//   ReportFindProgress     This function is called once for each advance in the target
2620//                          string from the find() function, and calls the user progress callback
2621//                          function if there is one installed.
2622//
2623//                          NOTE:
2624//
2625//                          If the match operation needs to be aborted because the user
2626//                          callback asked for it, just set an error status.
2627//                          The engine will pick that up and stop in its outer loop.
2628//
2629//--------------------------------------------------------------------------------
2630UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2631    if (fFindProgressCallbackFn != NULL) {
2632        if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2633            status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2634            return FALSE;
2635        }
2636    }
2637    return TRUE;
2638}
2639
2640//--------------------------------------------------------------------------------
2641//
2642//   StateSave
2643//       Make a new stack frame, initialized as a copy of the current stack frame.
2644//       Set the pattern index in the original stack frame from the operand value
2645//       in the opcode.  Execution of the engine continues with the state in
2646//       the newly created stack frame
2647//
2648//       Note that reserveBlock() may grow the stack, resulting in the
2649//       whole thing being relocated in memory.
2650//
2651//    Parameters:
2652//       fp           The top frame pointer when called.  At return, a new
2653//                    fame will be present
2654//       savePatIdx   An index into the compiled pattern.  Goes into the original
2655//                    (not new) frame.  If execution ever back-tracks out of the
2656//                    new frame, this will be where we continue from in the pattern.
2657//    Return
2658//                    The new frame pointer.
2659//
2660//--------------------------------------------------------------------------------
2661inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2662    // push storage for a new frame.
2663    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2664    if (newFP == NULL) {
2665        // Failure on attempted stack expansion.
2666        //   Stack function set some other error code, change it to a more
2667        //   specific one for regular expressions.
2668        status = U_REGEX_STACK_OVERFLOW;
2669        // We need to return a writable stack frame, so just return the
2670        //    previous frame.  The match operation will stop quickly
2671        //    because of the error status, after which the frame will never
2672        //    be looked at again.
2673        return fp;
2674    }
2675    fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2676
2677    // New stack frame = copy of old top frame.
2678    int64_t *source = (int64_t *)fp;
2679    int64_t *dest   = newFP;
2680    for (;;) {
2681        *dest++ = *source++;
2682        if (source == newFP) {
2683            break;
2684        }
2685    }
2686
2687    fTickCounter--;
2688    if (fTickCounter <= 0) {
2689       IncrementTime(status);    // Re-initializes fTickCounter
2690    }
2691    fp->fPatIdx = savePatIdx;
2692    return (REStackFrame *)newFP;
2693}
2694
2695
2696//--------------------------------------------------------------------------------
2697//
2698//   MatchAt      This is the actual matching engine.
2699//
2700//                  startIdx:    begin matching a this index.
2701//                  toEnd:       if true, match must extend to end of the input region
2702//
2703//--------------------------------------------------------------------------------
2704void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2705    UBool       isMatch  = FALSE;      // True if the we have a match.
2706
2707    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2708
2709    int32_t     op;                    // Operation from the compiled pattern, split into
2710    int32_t     opType;                //    the opcode
2711    int32_t     opValue;               //    and the operand value.
2712
2713    #ifdef REGEX_RUN_DEBUG
2714    if (fTraceDebug)
2715    {
2716        printf("MatchAt(startIdx=%ld)\n", startIdx);
2717        printf("Original Pattern: ");
2718        UChar32 c = utext_next32From(fPattern->fPattern, 0);
2719        while (c != U_SENTINEL) {
2720            if (c<32 || c>256) {
2721                c = '.';
2722            }
2723            REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2724
2725            c = UTEXT_NEXT32(fPattern->fPattern);
2726        }
2727        printf("\n");
2728        printf("Input String: ");
2729        c = utext_next32From(fInputText, 0);
2730        while (c != U_SENTINEL) {
2731            if (c<32 || c>256) {
2732                c = '.';
2733            }
2734            printf("%c", c);
2735
2736            c = UTEXT_NEXT32(fInputText);
2737        }
2738        printf("\n");
2739        printf("\n");
2740    }
2741    #endif
2742
2743    if (U_FAILURE(status)) {
2744        return;
2745    }
2746
2747    //  Cache frequently referenced items from the compiled pattern
2748    //
2749    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2750
2751    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2752    UVector             *sets          = fPattern->fSets;
2753
2754    fFrameSize = fPattern->fFrameSize;
2755    REStackFrame        *fp            = resetStack();
2756
2757    fp->fPatIdx   = 0;
2758    fp->fInputIdx = startIdx;
2759
2760    // Zero out the pattern's static data
2761    int32_t i;
2762    for (i = 0; i<fPattern->fDataSize; i++) {
2763        fData[i] = 0;
2764    }
2765
2766    //
2767    //  Main loop for interpreting the compiled pattern.
2768    //  One iteration of the loop per pattern operation performed.
2769    //
2770    for (;;) {
2771#if 0
2772        if (_heapchk() != _HEAPOK) {
2773            fprintf(stderr, "Heap Trouble\n");
2774        }
2775#endif
2776
2777        op      = (int32_t)pat[fp->fPatIdx];
2778        opType  = URX_TYPE(op);
2779        opValue = URX_VAL(op);
2780        #ifdef REGEX_RUN_DEBUG
2781        if (fTraceDebug) {
2782            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2783            printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
2784                UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2785            fPattern->dumpOp(fp->fPatIdx);
2786        }
2787        #endif
2788        fp->fPatIdx++;
2789
2790        switch (opType) {
2791
2792
2793        case URX_NOP:
2794            break;
2795
2796
2797        case URX_BACKTRACK:
2798            // Force a backtrack.  In some circumstances, the pattern compiler
2799            //   will notice that the pattern can't possibly match anything, and will
2800            //   emit one of these at that point.
2801            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2802            break;
2803
2804
2805        case URX_ONECHAR:
2806            if (fp->fInputIdx < fActiveLimit) {
2807                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2808                UChar32 c = UTEXT_NEXT32(fInputText);
2809                if (c == opValue) {
2810                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2811                    break;
2812                }
2813            } else {
2814                fHitEnd = TRUE;
2815            }
2816
2817            #ifdef REGEX_SMART_BACKTRACKING
2818            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
2819                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2820                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2821                    UBool success = FALSE;
2822                    UChar32 c = UTEXT_PREVIOUS32(fInputText);
2823                    while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
2824                        if (c == opValue) {
2825                            success = TRUE;
2826                            break;
2827                        } else if (c == U_SENTINEL) {
2828                            break;
2829                        }
2830                        c = UTEXT_PREVIOUS32(fInputText);
2831                    }
2832                    if (success) {
2833                        fHitEnd = FALSE;
2834                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2835                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2836                        if (fp->fInputIdx > backSearchIndex) {
2837                            fp = StateSave(fp, fp->fPatIdx, status);
2838                        }
2839                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
2840                        break;
2841                    }
2842                }
2843            }
2844            #endif
2845
2846            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2847            break;
2848
2849
2850        case URX_STRING:
2851            {
2852                // Test input against a literal string.
2853                // Strings require two slots in the compiled pattern, one for the
2854                //   offset to the string text, and one for the length.
2855                int32_t   stringStartIdx = opValue;
2856                int32_t   stringLen;
2857
2858                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2859                fp->fPatIdx++;
2860                opType    = URX_TYPE(op);
2861                stringLen = URX_VAL(op);
2862                U_ASSERT(opType == URX_STRING_LEN);
2863                U_ASSERT(stringLen >= 2);
2864
2865                const UChar *patternChars = litText+stringStartIdx;
2866                const UChar *patternEnd = patternChars+stringLen;
2867
2868                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2869                UChar32 c;
2870                UBool success = TRUE;
2871
2872                while (patternChars < patternEnd && success) {
2873                    c = UTEXT_NEXT32(fInputText);
2874
2875                    if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
2876                        if (U_IS_BMP(c)) {
2877                            success = (*patternChars == c);
2878                            patternChars += 1;
2879                        } else if (patternChars+1 < patternEnd) {
2880                            success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
2881                            patternChars += 2;
2882                        }
2883                    } else {
2884                        success = FALSE;
2885                        fHitEnd = TRUE;          //   TODO:  See ticket 6074
2886                    }
2887                }
2888
2889                if (success) {
2890                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2891                } else {
2892                    #ifdef REGEX_SMART_BACKTRACKING
2893                    if (fp->fInputIdx > backSearchIndex && fStack->size()) {
2894                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2895                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2896                            // Reset to last start point
2897                            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2898                            patternChars = litText+stringStartIdx;
2899
2900                            // Search backwards for a possible start
2901                            do {
2902                                c = UTEXT_PREVIOUS32(fInputText);
2903                                if (c == U_SENTINEL) {
2904                                    break;
2905                                } else if ((U_IS_BMP(c) && *patternChars == c) ||
2906                                    (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
2907                                    success = TRUE;
2908                                    break;
2909                                }
2910                            } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
2911
2912                            // And try again
2913                            if (success) {
2914                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2915                                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2916                                if (fp->fInputIdx > backSearchIndex) {
2917                                    fp = StateSave(fp, fp->fPatIdx, status);
2918                                }
2919                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
2920                                break;
2921                            }
2922                        }
2923                    }
2924                    #endif
2925                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2926                }
2927            }
2928            break;
2929
2930
2931        case URX_STATE_SAVE:
2932            fp = StateSave(fp, opValue, status);
2933            break;
2934
2935
2936        case URX_END:
2937            // The match loop will exit via this path on a successful match,
2938            //   when we reach the end of the pattern.
2939            if (toEnd && fp->fInputIdx != fActiveLimit) {
2940                // The pattern matched, but not to the end of input.  Try some more.
2941                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2942                break;
2943            }
2944            isMatch = TRUE;
2945            goto  breakFromLoop;
2946
2947        // Start and End Capture stack frame variables are laid out out like this:
2948            //  fp->fExtra[opValue]  - The start of a completed capture group
2949            //             opValue+1 - The end   of a completed capture group
2950            //             opValue+2 - the start of a capture group whose end
2951            //                          has not yet been reached (and might not ever be).
2952        case URX_START_CAPTURE:
2953            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2954            fp->fExtra[opValue+2] = fp->fInputIdx;
2955            break;
2956
2957
2958        case URX_END_CAPTURE:
2959            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2960            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2961            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2962            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2963            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2964            break;
2965
2966
2967        case URX_DOLLAR:                   //  $, test for End of line
2968                                           //     or for position before new line at end of input
2969            {
2970                if (fp->fInputIdx >= fAnchorLimit) {
2971                    // We really are at the end of input.  Success.
2972                    fHitEnd = TRUE;
2973                    fRequireEnd = TRUE;
2974                    break;
2975                }
2976
2977                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2978
2979                // If we are positioned just before a new-line that is located at the
2980                //   end of input, succeed.
2981                UChar32 c = UTEXT_NEXT32(fInputText);
2982                if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2983                    if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
2984                        // If not in the middle of a CR/LF sequence
2985                        if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2986                            // At new-line at end of input. Success
2987                            fHitEnd = TRUE;
2988                            fRequireEnd = TRUE;
2989
2990                            break;
2991                        }
2992                    }
2993                } else {
2994                    UChar32 nextC = UTEXT_NEXT32(fInputText);
2995                    if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2996                        fHitEnd = TRUE;
2997                        fRequireEnd = TRUE;
2998                        break;                         // At CR/LF at end of input.  Success
2999                    }
3000                }
3001
3002                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3003            }
3004            break;
3005
3006
3007         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
3008            if (fp->fInputIdx >= fAnchorLimit) {
3009                // Off the end of input.  Success.
3010                fHitEnd = TRUE;
3011                fRequireEnd = TRUE;
3012                break;
3013            } else {
3014                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3015                UChar32 c = UTEXT_NEXT32(fInputText);
3016                // Either at the last character of input, or off the end.
3017                if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3018                    fHitEnd = TRUE;
3019                    fRequireEnd = TRUE;
3020                    break;
3021                }
3022            }
3023
3024            // Not at end of input.  Back-track out.
3025            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3026            break;
3027
3028
3029         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3030             {
3031                 if (fp->fInputIdx >= fAnchorLimit) {
3032                     // We really are at the end of input.  Success.
3033                     fHitEnd = TRUE;
3034                     fRequireEnd = TRUE;
3035                     break;
3036                 }
3037                 // If we are positioned just before a new-line, succeed.
3038                 // It makes no difference where the new-line is within the input.
3039                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3040                 UChar32 c = UTEXT_CURRENT32(fInputText);
3041                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3042                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3043                     //  In multi-line mode, hitting a new-line just before the end of input does not
3044                     //   set the hitEnd or requireEnd flags
3045                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3046                        break;
3047                     }
3048                 }
3049                 // not at a new line.  Fail.
3050                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3051             }
3052             break;
3053
3054
3055         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3056             {
3057                 if (fp->fInputIdx >= fAnchorLimit) {
3058                     // We really are at the end of input.  Success.
3059                     fHitEnd = TRUE;
3060                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3061                     break;               //   adding a new-line would not lose the match.
3062                 }
3063                 // If we are not positioned just before a new-line, the test fails; backtrack out.
3064                 // It makes no difference where the new-line is within the input.
3065                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3066                 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3067                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3068                 }
3069             }
3070             break;
3071
3072
3073       case URX_CARET:                    //  ^, test for start of line
3074            if (fp->fInputIdx != fAnchorStart) {
3075                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3076            }
3077            break;
3078
3079
3080       case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3081           {
3082               if (fp->fInputIdx == fAnchorStart) {
3083                   // We are at the start input.  Success.
3084                   break;
3085               }
3086               // Check whether character just before the current pos is a new-line
3087               //   unless we are at the end of input
3088               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3089               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3090               if ((fp->fInputIdx < fAnchorLimit) &&
3091                   ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3092                   //  It's a new-line.  ^ is true.  Success.
3093                   //  TODO:  what should be done with positions between a CR and LF?
3094                   break;
3095               }
3096               // Not at the start of a line.  Fail.
3097               fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3098           }
3099           break;
3100
3101
3102       case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3103           {
3104               U_ASSERT(fp->fInputIdx >= fAnchorStart);
3105               if (fp->fInputIdx <= fAnchorStart) {
3106                   // We are at the start input.  Success.
3107                   break;
3108               }
3109               // Check whether character just before the current pos is a new-line
3110               U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3111               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3112               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3113               if (c != 0x0a) {
3114                   // Not at the start of a line.  Back-track out.
3115                   fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3116               }
3117           }
3118           break;
3119
3120        case URX_BACKSLASH_B:          // Test for word boundaries
3121            {
3122                UBool success = isWordBoundary(fp->fInputIdx);
3123                success ^= (opValue != 0);     // flip sense for \B
3124                if (!success) {
3125                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3126                }
3127            }
3128            break;
3129
3130
3131        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3132            {
3133                UBool success = isUWordBoundary(fp->fInputIdx);
3134                success ^= (opValue != 0);     // flip sense for \B
3135                if (!success) {
3136                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3137                }
3138            }
3139            break;
3140
3141
3142        case URX_BACKSLASH_D:            // Test for decimal digit
3143            {
3144                if (fp->fInputIdx >= fActiveLimit) {
3145                    fHitEnd = TRUE;
3146                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3147                    break;
3148                }
3149
3150                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3151
3152                UChar32 c = UTEXT_NEXT32(fInputText);
3153                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3154                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3155                success ^= (opValue != 0);        // flip sense for \D
3156                if (success) {
3157                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3158                } else {
3159                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3160                }
3161            }
3162            break;
3163
3164
3165        case URX_BACKSLASH_G:          // Test for position at end of previous match
3166            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3167                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3168            }
3169            break;
3170
3171
3172        case URX_BACKSLASH_X:
3173            //  Match a Grapheme, as defined by Unicode TR 29.
3174            //  Differs slightly from Perl, which consumes combining marks independently
3175            //    of context.
3176            {
3177
3178                // Fail if at end of input
3179                if (fp->fInputIdx >= fActiveLimit) {
3180                    fHitEnd = TRUE;
3181                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3182                    break;
3183                }
3184
3185                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3186
3187                // Examine (and consume) the current char.
3188                //   Dispatch into a little state machine, based on the char.
3189                UChar32  c;
3190                c = UTEXT_NEXT32(fInputText);
3191                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3192                UnicodeSet **sets = fPattern->fStaticSets;
3193                if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3194                if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3195                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3196                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3197                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3198                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3199                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3200                goto GC_Extend;
3201
3202
3203
3204GC_L:
3205                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3206                c = UTEXT_NEXT32(fInputText);
3207                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3208                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3209                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3210                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3211                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3212                UTEXT_PREVIOUS32(fInputText);
3213                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3214                goto GC_Extend;
3215
3216GC_V:
3217                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3218                c = UTEXT_NEXT32(fInputText);
3219                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3220                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3221                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3222                UTEXT_PREVIOUS32(fInputText);
3223                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3224                goto GC_Extend;
3225
3226GC_T:
3227                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3228                c = UTEXT_NEXT32(fInputText);
3229                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3230                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3231                UTEXT_PREVIOUS32(fInputText);
3232                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3233                goto GC_Extend;
3234
3235GC_Extend:
3236                // Combining characters are consumed here
3237                for (;;) {
3238                    if (fp->fInputIdx >= fActiveLimit) {
3239                        break;
3240                    }
3241                    c = UTEXT_CURRENT32(fInputText);
3242                    if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3243                        break;
3244                    }
3245                    UTEXT_NEXT32(fInputText);
3246                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3247                }
3248                goto GC_Done;
3249
3250GC_Control:
3251                // Most control chars stand alone (don't combine with combining chars),
3252                //   except for that CR/LF sequence is a single grapheme cluster.
3253                if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3254                    c = UTEXT_NEXT32(fInputText);
3255                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3256                }
3257
3258GC_Done:
3259                if (fp->fInputIdx >= fActiveLimit) {
3260                    fHitEnd = TRUE;
3261                }
3262                break;
3263            }
3264
3265
3266
3267
3268        case URX_BACKSLASH_Z:          // Test for end of Input
3269            if (fp->fInputIdx < fAnchorLimit) {
3270                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3271            } else {
3272                fHitEnd = TRUE;
3273                fRequireEnd = TRUE;
3274            }
3275            break;
3276
3277
3278
3279        case URX_STATIC_SETREF:
3280            {
3281                // Test input character against one of the predefined sets
3282                //    (Word Characters, for example)
3283                // The high bit of the op value is a flag for the match polarity.
3284                //    0:   success if input char is in set.
3285                //    1:   success if input char is not in set.
3286                if (fp->fInputIdx >= fActiveLimit) {
3287                    fHitEnd = TRUE;
3288                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3289                    break;
3290                }
3291
3292                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3293                opValue &= ~URX_NEG_SET;
3294                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3295
3296                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3297                UChar32 c = UTEXT_NEXT32(fInputText);
3298                if (c < 256) {
3299                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3300                    if (s8->contains(c)) {
3301                        success = !success;
3302                    }
3303                } else {
3304                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3305                    if (s->contains(c)) {
3306                        success = !success;
3307                    }
3308                }
3309                if (success) {
3310                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3311                } else {
3312                    // the character wasn't in the set.
3313                    #ifdef REGEX_SMART_BACKTRACKING
3314                    if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3315                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3316                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3317                            // Try to find it, backwards
3318                            UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3319                            success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
3320                            do {
3321                                c = UTEXT_PREVIOUS32(fInputText);
3322                                if (c == U_SENTINEL) {
3323                                    break;
3324                                } else if (c < 256) {
3325                                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3326                                    if (s8->contains(c)) {
3327                                        success = !success;
3328                                    }
3329                                } else {
3330                                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3331                                    if (s->contains(c)) {
3332                                        success = !success;
3333                                    }
3334                                }
3335                            } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
3336
3337                            if (success && c != U_SENTINEL) {
3338                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3339                                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3340                                if (fp->fInputIdx > backSearchIndex) {
3341                                    fp = StateSave(fp, fp->fPatIdx, status);
3342                                }
3343                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
3344                                break;
3345                            }
3346                        }
3347                    }
3348                    #endif
3349                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3350                }
3351            }
3352            break;
3353
3354
3355        case URX_STAT_SETREF_N:
3356            {
3357                // Test input character for NOT being a member of  one of
3358                //    the predefined sets (Word Characters, for example)
3359                if (fp->fInputIdx >= fActiveLimit) {
3360                    fHitEnd = TRUE;
3361                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3362                    break;
3363                }
3364
3365                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3366
3367                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3368
3369                UChar32 c = UTEXT_NEXT32(fInputText);
3370                if (c < 256) {
3371                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3372                    if (s8->contains(c) == FALSE) {
3373                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3374                        break;
3375                    }
3376                } else {
3377                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3378                    if (s->contains(c) == FALSE) {
3379                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3380                        break;
3381                    }
3382                }
3383                // the character wasn't in the set.
3384                #ifdef REGEX_SMART_BACKTRACKING
3385                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3386                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3387                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3388                        // Try to find it, backwards
3389                        UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3390                        UBool success = FALSE;
3391                        do {
3392                            c = UTEXT_PREVIOUS32(fInputText);
3393                            if (c == U_SENTINEL) {
3394                                break;
3395                            } else if (c < 256) {
3396                                Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3397                                if (s8->contains(c) == FALSE) {
3398                                    success = TRUE;
3399                                    break;
3400                                }
3401                            } else {
3402                                const UnicodeSet *s = fPattern->fStaticSets[opValue];
3403                                if (s->contains(c) == FALSE) {
3404                                    success = TRUE;
3405                                    break;
3406                                }
3407                            }
3408                        } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3409
3410                        if (success) {
3411                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3412                            fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3413                            if (fp->fInputIdx > backSearchIndex) {
3414                                fp = StateSave(fp, fp->fPatIdx, status);
3415                            }
3416                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
3417                            break;
3418                        }
3419                    }
3420                }
3421                #endif
3422                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3423            }
3424            break;
3425
3426
3427        case URX_SETREF:
3428            if (fp->fInputIdx >= fActiveLimit) {
3429                fHitEnd = TRUE;
3430                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3431                break;
3432            } else {
3433                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3434
3435                // There is input left.  Pick up one char and test it for set membership.
3436                UChar32 c = UTEXT_NEXT32(fInputText);
3437                U_ASSERT(opValue > 0 && opValue < sets->size());
3438                if (c<256) {
3439                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3440                    if (s8->contains(c)) {
3441                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3442                        break;
3443                    }
3444                } else {
3445                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3446                    if (s->contains(c)) {
3447                        // The character is in the set.  A Match.
3448                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3449                        break;
3450                    }
3451                }
3452
3453                // the character wasn't in the set.
3454                #ifdef REGEX_SMART_BACKTRACKING
3455                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3456                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3457                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3458                        // Try to find it, backwards
3459                        UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3460                        UBool success = FALSE;
3461                        do {
3462                            c = UTEXT_PREVIOUS32(fInputText);
3463                            if (c == U_SENTINEL) {
3464                                break;
3465                            } else if (c < 256) {
3466                                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3467                                if (s8->contains(c)) {
3468                                    success = TRUE;
3469                                    break;
3470                                }
3471                            } else {
3472                                UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3473                                if (s->contains(c)) {
3474                                    success = TRUE;
3475                                    break;
3476                                }
3477                            }
3478                        } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3479
3480                        if (success) {
3481                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3482                            fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3483                            if (fp->fInputIdx > backSearchIndex) {
3484                                fp = StateSave(fp, fp->fPatIdx, status);
3485                            }
3486                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
3487                            break;
3488                        }
3489                    }
3490                }
3491                #endif
3492                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3493            }
3494            break;
3495
3496
3497        case URX_DOTANY:
3498            {
3499                // . matches anything, but stops at end-of-line.
3500                if (fp->fInputIdx >= fActiveLimit) {
3501                    // At end of input.  Match failed.  Backtrack out.
3502                    fHitEnd = TRUE;
3503                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3504                    break;
3505                }
3506
3507                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3508
3509                // There is input left.  Advance over one char, unless we've hit end-of-line
3510                UChar32 c = UTEXT_NEXT32(fInputText);
3511                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
3512                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3513                    // End of line in normal mode.   . does not match.
3514                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3515                    break;
3516                }
3517                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3518            }
3519            break;
3520
3521
3522        case URX_DOTANY_ALL:
3523            {
3524                // ., in dot-matches-all (including new lines) mode
3525                if (fp->fInputIdx >= fActiveLimit) {
3526                    // At end of input.  Match failed.  Backtrack out.
3527                    fHitEnd = TRUE;
3528                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3529                    break;
3530                }
3531
3532                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3533
3534                // There is input left.  Advance over one char, except if we are
3535                //   at a cr/lf, advance over both of them.
3536                UChar32 c;
3537                c = UTEXT_NEXT32(fInputText);
3538                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3539                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3540                    // In the case of a CR/LF, we need to advance over both.
3541                    UChar32 nextc = UTEXT_CURRENT32(fInputText);
3542                    if (nextc == 0x0a) {
3543                        UTEXT_NEXT32(fInputText);
3544                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3545                    }
3546                }
3547            }
3548            break;
3549
3550
3551        case URX_DOTANY_UNIX:
3552            {
3553                // '.' operator, matches all, but stops at end-of-line.
3554                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3555                if (fp->fInputIdx >= fActiveLimit) {
3556                    // At end of input.  Match failed.  Backtrack out.
3557                    fHitEnd = TRUE;
3558                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3559                    break;
3560                }
3561
3562                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3563
3564                // There is input left.  Advance over one char, unless we've hit end-of-line
3565                UChar32 c = UTEXT_NEXT32(fInputText);
3566                if (c == 0x0a) {
3567                    // End of line in normal mode.   '.' does not match the \n
3568                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3569                } else {
3570                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3571                }
3572            }
3573            break;
3574
3575
3576        case URX_JMP:
3577            fp->fPatIdx = opValue;
3578            break;
3579
3580        case URX_FAIL:
3581            isMatch = FALSE;
3582            goto breakFromLoop;
3583
3584        case URX_JMP_SAV:
3585            U_ASSERT(opValue < fPattern->fCompiledPat->size());
3586            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3587            fp->fPatIdx = opValue;                         // Then JMP.
3588            break;
3589
3590        case URX_JMP_SAV_X:
3591            // This opcode is used with (x)+, when x can match a zero length string.
3592            // Same as JMP_SAV, except conditional on the match having made forward progress.
3593            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3594            //   data address of the input position at the start of the loop.
3595            {
3596                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3597                int32_t  stoOp = (int32_t)pat[opValue-1];
3598                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3599                int32_t  frameLoc = URX_VAL(stoOp);
3600                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3601                int64_t prevInputIdx = fp->fExtra[frameLoc];
3602                U_ASSERT(prevInputIdx <= fp->fInputIdx);
3603                if (prevInputIdx < fp->fInputIdx) {
3604                    // The match did make progress.  Repeat the loop.
3605                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3606                    fp->fPatIdx = opValue;
3607                    fp->fExtra[frameLoc] = fp->fInputIdx;
3608                }
3609                // If the input position did not advance, we do nothing here,
3610                //   execution will fall out of the loop.
3611            }
3612            break;
3613
3614        case URX_CTR_INIT:
3615            {
3616                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3617                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3618
3619                // Pick up the three extra operands that CTR_INIT has, and
3620                //    skip the pattern location counter past
3621                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3622                fp->fPatIdx += 3;
3623                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3624                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3625                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3626                U_ASSERT(minCount>=0);
3627                U_ASSERT(maxCount>=minCount || maxCount==-1);
3628                U_ASSERT(loopLoc>fp->fPatIdx);
3629
3630                if (minCount == 0) {
3631                    fp = StateSave(fp, loopLoc+1, status);
3632                }
3633                if (maxCount == 0) {
3634                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3635                }
3636            }
3637            break;
3638
3639        case URX_CTR_LOOP:
3640            {
3641                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3642                int32_t initOp = (int32_t)pat[opValue];
3643                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3644                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3645                int32_t minCount  = (int32_t)pat[opValue+2];
3646                int32_t maxCount  = (int32_t)pat[opValue+3];
3647                // Increment the counter.  Note: we DIDN'T worry about counter
3648                //   overflow, since the data comes from UnicodeStrings, which
3649                //   stores its length in an int32_t. Do we have to think about
3650                //   this now that we're using UText? Probably not, since the length
3651                //   in UChar32s is still an int32_t.
3652                (*pCounter)++;
3653                U_ASSERT(*pCounter > 0);
3654                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3655                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
3656                    break;
3657                }
3658                if (*pCounter >= minCount) {
3659                    fp = StateSave(fp, fp->fPatIdx, status);
3660                }
3661                fp->fPatIdx = opValue + 4;    // Loop back.
3662            }
3663            break;
3664
3665        case URX_CTR_INIT_NG:
3666            {
3667                // Initialize a non-greedy loop
3668                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3669                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3670
3671                // Pick up the three extra operands that CTR_INIT has, and
3672                //    skip the pattern location counter past
3673                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3674                fp->fPatIdx += 3;
3675                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3676                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3677                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3678                U_ASSERT(minCount>=0);
3679                U_ASSERT(maxCount>=minCount || maxCount==-1);
3680                U_ASSERT(loopLoc>fp->fPatIdx);
3681
3682                if (minCount == 0) {
3683                    if (maxCount != 0) {
3684                        fp = StateSave(fp, fp->fPatIdx, status);
3685                    }
3686                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3687                }
3688            }
3689            break;
3690
3691        case URX_CTR_LOOP_NG:
3692            {
3693                // Non-greedy {min, max} loops
3694                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3695                int32_t initOp = (int32_t)pat[opValue];
3696                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3697                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3698                int32_t minCount  = (int32_t)pat[opValue+2];
3699                int32_t maxCount  = (int32_t)pat[opValue+3];
3700                // Increment the counter.  Note: we DIDN'T worry about counter
3701                //   overflow, since the data comes from UnicodeStrings, which
3702                //   stores its length in an int32_t. Do we have to think about
3703                //   this now that we're using UText? Probably not, since the length
3704                //   in UChar32s is still an int32_t.
3705                (*pCounter)++;
3706                U_ASSERT(*pCounter > 0);
3707
3708                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3709                    // The loop has matched the maximum permitted number of times.
3710                    //   Break out of here with no action.  Matching will
3711                    //   continue with the following pattern.
3712                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
3713                    break;
3714                }
3715
3716                if (*pCounter < minCount) {
3717                    // We haven't met the minimum number of matches yet.
3718                    //   Loop back for another one.
3719                    fp->fPatIdx = opValue + 4;    // Loop back.
3720                } else {
3721                    // We do have the minimum number of matches.
3722                    //   Fall into the following pattern, but first do
3723                    //   a state save to the top of the loop, so that a failure
3724                    //   in the following pattern will try another iteration of the loop.
3725                    fp = StateSave(fp, opValue + 4, status);
3726                }
3727            }
3728            break;
3729
3730        case URX_STO_SP:
3731            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3732            fData[opValue] = fStack->size();
3733            break;
3734
3735        case URX_LD_SP:
3736            {
3737                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3738                int32_t newStackSize = (int32_t)fData[opValue];
3739                U_ASSERT(newStackSize <= fStack->size());
3740                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3741                if (newFP == (int64_t *)fp) {
3742                    break;
3743                }
3744                int32_t i;
3745                for (i=0; i<fFrameSize; i++) {
3746                    newFP[i] = ((int64_t *)fp)[i];
3747                }
3748                fp = (REStackFrame *)newFP;
3749                fStack->setSize(newStackSize);
3750            }
3751            break;
3752
3753        case URX_BACKREF:
3754        case URX_BACKREF_I:
3755            {
3756                U_ASSERT(opValue < fFrameSize);
3757                int64_t groupStartIdx = fp->fExtra[opValue];
3758                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3759                U_ASSERT(groupStartIdx <= groupEndIdx);
3760                if (groupStartIdx < 0) {
3761                    // This capture group has not participated in the match thus far,
3762                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3763                }
3764
3765                if (groupEndIdx == groupStartIdx) {
3766                    //   The capture group match was of an empty string.
3767                    //   Verified by testing:  Perl matches succeed in this case, so
3768                    //   we do too.
3769                    break;
3770                }
3771
3772                UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3773                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3774
3775                UBool haveMatch = (opType == URX_BACKREF ?
3776                    (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
3777                    (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
3778                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3779
3780                if (fp->fInputIdx > fActiveLimit) {
3781                    fHitEnd = TRUE;
3782                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3783                } else if (!haveMatch) {
3784                    if (fp->fInputIdx == fActiveLimit) {
3785                        fHitEnd = TRUE;
3786                    }
3787                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3788                }
3789            }
3790            break;
3791
3792        case URX_STO_INP_LOC:
3793            {
3794                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3795                fp->fExtra[opValue] = fp->fInputIdx;
3796            }
3797            break;
3798
3799        case URX_JMPX:
3800            {
3801                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3802                fp->fPatIdx += 1;
3803                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3804                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3805                int64_t savedInputIdx = fp->fExtra[dataLoc];
3806                U_ASSERT(savedInputIdx <= fp->fInputIdx);
3807                if (savedInputIdx < fp->fInputIdx) {
3808                    fp->fPatIdx = opValue;                               // JMP
3809                } else {
3810                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3811                }
3812            }
3813            break;
3814
3815        case URX_LA_START:
3816            {
3817                // Entering a lookahead block.
3818                // Save Stack Ptr, Input Pos.
3819                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3820                fData[opValue]   = fStack->size();
3821                fData[opValue+1] = fp->fInputIdx;
3822                fActiveStart     = fLookStart;          // Set the match region change for
3823                fActiveLimit     = fLookLimit;          //   transparent bounds.
3824            }
3825            break;
3826
3827        case URX_LA_END:
3828            {
3829                // Leaving a look-ahead block.
3830                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3831                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3832                int32_t stackSize = fStack->size();
3833                int32_t newStackSize =(int32_t)fData[opValue];
3834                U_ASSERT(stackSize >= newStackSize);
3835                if (stackSize > newStackSize) {
3836                    // Copy the current top frame back to the new (cut back) top frame.
3837                    //   This makes the capture groups from within the look-ahead
3838                    //   expression available.
3839                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3840                    int32_t i;
3841                    for (i=0; i<fFrameSize; i++) {
3842                        newFP[i] = ((int64_t *)fp)[i];
3843                    }
3844                    fp = (REStackFrame *)newFP;
3845                    fStack->setSize(newStackSize);
3846                }
3847                fp->fInputIdx = fData[opValue+1];
3848
3849                // Restore the active region bounds in the input string; they may have
3850                //    been changed because of transparent bounds on a Region.
3851                fActiveStart = fRegionStart;
3852                fActiveLimit = fRegionLimit;
3853            }
3854            break;
3855
3856        case URX_ONECHAR_I:
3857            if (fp->fInputIdx < fActiveLimit) {
3858                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3859
3860                UChar32 c = UTEXT_NEXT32(fInputText);
3861                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3862                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3863                    break;
3864                }
3865            } else {
3866                fHitEnd = TRUE;
3867            }
3868
3869            #ifdef REGEX_SMART_BACKTRACKING
3870            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3871                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3872                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3873                    UBool success = FALSE;
3874                    UChar32 c = UTEXT_PREVIOUS32(fInputText);
3875                    while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
3876                        if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3877                            success = TRUE;
3878                            break;
3879                        } else if (c == U_SENTINEL) {
3880                            break;
3881                        }
3882                        c = UTEXT_PREVIOUS32(fInputText);
3883                    }
3884                    if (success) {
3885                        fHitEnd = FALSE;
3886                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3887                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3888                        if (fp->fInputIdx > backSearchIndex) {
3889                            fp = StateSave(fp, fp->fPatIdx, status);
3890                        }
3891                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
3892                        break;
3893                    }
3894                }
3895            }
3896            #endif
3897
3898            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3899            break;
3900
3901        case URX_STRING_I:
3902            {
3903                // Test input against a literal string.
3904                // Strings require two slots in the compiled pattern, one for the
3905                //   offset to the string text, and one for the length.
3906                const UCaseProps *csp = ucase_getSingleton();
3907                {
3908                    int32_t stringStartIdx, stringLen;
3909                    stringStartIdx = opValue;
3910
3911                    op      = (int32_t)pat[fp->fPatIdx];
3912                    fp->fPatIdx++;
3913                    opType  = URX_TYPE(op);
3914                    opValue = URX_VAL(op);
3915                    U_ASSERT(opType == URX_STRING_LEN);
3916                    stringLen = opValue;
3917
3918                    const UChar *patternChars = litText+stringStartIdx;
3919                    const UChar *patternEnd = patternChars+stringLen;
3920
3921                    const UChar *foldChars = NULL;
3922                    int32_t foldOffset, foldLength;
3923                    UChar32 c;
3924
3925                    foldOffset = foldLength = 0;
3926                    UBool success = TRUE;
3927
3928                    UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3929                    while (patternChars < patternEnd && success) {
3930                        if(foldOffset < foldLength) {
3931                            U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3932                        } else {
3933                            c = UTEXT_NEXT32(fInputText);
3934                            if (c != U_SENTINEL) {
3935                                foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3936                                if(foldLength >= 0) {
3937                                    if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
3938                                        foldOffset = 0;
3939                                        U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3940                                    } else {
3941                                        c = foldLength;
3942                                        foldLength = foldOffset; // to avoid reading chars from the folding buffer
3943                                    }
3944                                }
3945                            }
3946
3947                            fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3948                        }
3949
3950                        success = FALSE;
3951                        if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
3952                            if (U_IS_BMP(c)) {
3953                                success = (*patternChars == c);
3954                                patternChars += 1;
3955                            } else if (patternChars+1 < patternEnd) {
3956                                success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
3957                                patternChars += 2;
3958                            }
3959                        } else {
3960                            fHitEnd = TRUE;          //   TODO:  See ticket 6074
3961                        }
3962                    }
3963
3964                    if (!success) {
3965                        #ifdef REGEX_SMART_BACKTRACKING
3966                        if (fp->fInputIdx > backSearchIndex && fStack->size()) {
3967                            REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3968                            if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3969                                // Reset to last start point
3970                                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3971                                patternChars = litText+stringStartIdx;
3972
3973                                // Search backwards for a possible start
3974                                do {
3975                                    c = UTEXT_PREVIOUS32(fInputText);
3976                                    if (c == U_SENTINEL) {
3977                                        break;
3978                                    } else {
3979                                        foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3980                                        if(foldLength >= 0) {
3981                                            if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
3982                                                foldOffset = 0;
3983                                                U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3984                                            } else {
3985                                                c = foldLength;
3986                                                foldLength = foldOffset; // to avoid reading chars from the folding buffer
3987                                            }
3988                                        }
3989
3990                                        if ((U_IS_BMP(c) && *patternChars == c) ||
3991                                               (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
3992                                            success = TRUE;
3993                                            break;
3994                                        }
3995                                    }
3996                                } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3997
3998                                // And try again
3999                                if (success) {
4000                                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4001                                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4002                                    if (fp->fInputIdx > backSearchIndex) {
4003                                        fp = StateSave(fp, fp->fPatIdx, status);
4004                                    }
4005                                    fp->fPatIdx++; // Skip the LOOP_C, we just did that
4006                                    break;
4007                                }
4008                            }
4009                        }
4010                        #endif
4011                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4012                    }
4013                }
4014            }
4015            break;
4016
4017        case URX_LB_START:
4018            {
4019                // Entering a look-behind block.
4020                // Save Stack Ptr, Input Pos.
4021                //   TODO:  implement transparent bounds.  Ticket #6067
4022                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4023                fData[opValue]   = fStack->size();
4024                fData[opValue+1] = fp->fInputIdx;
4025                // Init the variable containing the start index for attempted matches.
4026                fData[opValue+2] = -1;
4027                // Save input string length, then reset to pin any matches to end at
4028                //   the current position.
4029                fData[opValue+3] = fActiveLimit;
4030                fActiveLimit     = fp->fInputIdx;
4031            }
4032            break;
4033
4034
4035        case URX_LB_CONT:
4036            {
4037                // Positive Look-Behind, at top of loop checking for matches of LB expression
4038                //    at all possible input starting positions.
4039
4040                // Fetch the min and max possible match lengths.  They are the operands
4041                //   of this op in the pattern.
4042                int32_t minML = (int32_t)pat[fp->fPatIdx++];
4043                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4044                U_ASSERT(minML <= maxML);
4045                U_ASSERT(minML >= 0);
4046
4047                // Fetch (from data) the last input index where a match was attempted.
4048                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4049                int64_t  *lbStartIdx = &fData[opValue+2];
4050                if (*lbStartIdx < 0) {
4051                    // First time through loop.
4052                    *lbStartIdx = fp->fInputIdx - minML;
4053                } else {
4054                    // 2nd through nth time through the loop.
4055                    // Back up start position for match by one.
4056                    if (*lbStartIdx == 0) {
4057                        (*lbStartIdx)--;
4058                    } else {
4059                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4060                        UTEXT_PREVIOUS32(fInputText);
4061                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4062                    }
4063                }
4064
4065                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4066                    // We have tried all potential match starting points without
4067                    //  getting a match.  Backtrack out, and out of the
4068                    //   Look Behind altogether.
4069                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4070                    int64_t restoreInputLen = fData[opValue+3];
4071                    U_ASSERT(restoreInputLen >= fActiveLimit);
4072                    U_ASSERT(restoreInputLen <= fInputLength);
4073                    fActiveLimit = restoreInputLen;
4074                    break;
4075                }
4076
4077                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4078                //      (successful match will fall off the end of the loop.)
4079                fp = StateSave(fp, fp->fPatIdx-3, status);
4080                fp->fInputIdx = *lbStartIdx;
4081            }
4082            break;
4083
4084        case URX_LB_END:
4085            // End of a look-behind block, after a successful match.
4086            {
4087                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4088                if (fp->fInputIdx != fActiveLimit) {
4089                    //  The look-behind expression matched, but the match did not
4090                    //    extend all the way to the point that we are looking behind from.
4091                    //  FAIL out of here, which will take us back to the LB_CONT, which
4092                    //     will retry the match starting at another position or fail
4093                    //     the look-behind altogether, whichever is appropriate.
4094                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4095                    break;
4096                }
4097
4098                // Look-behind match is good.  Restore the orignal input string length,
4099                //   which had been truncated to pin the end of the lookbehind match to the
4100                //   position being looked-behind.
4101                int64_t originalInputLen = fData[opValue+3];
4102                U_ASSERT(originalInputLen >= fActiveLimit);
4103                U_ASSERT(originalInputLen <= fInputLength);
4104                fActiveLimit = originalInputLen;
4105            }
4106            break;
4107
4108
4109        case URX_LBN_CONT:
4110            {
4111                // Negative Look-Behind, at top of loop checking for matches of LB expression
4112                //    at all possible input starting positions.
4113
4114                // Fetch the extra parameters of this op.
4115                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
4116                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
4117                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4118                        continueLoc = URX_VAL(continueLoc);
4119                U_ASSERT(minML <= maxML);
4120                U_ASSERT(minML >= 0);
4121                U_ASSERT(continueLoc > fp->fPatIdx);
4122
4123                // Fetch (from data) the last input index where a match was attempted.
4124                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4125                int64_t  *lbStartIdx = &fData[opValue+2];
4126                if (*lbStartIdx < 0) {
4127                    // First time through loop.
4128                    *lbStartIdx = fp->fInputIdx - minML;
4129                } else {
4130                    // 2nd through nth time through the loop.
4131                    // Back up start position for match by one.
4132                    if (*lbStartIdx == 0) {
4133                        (*lbStartIdx)--;
4134                    } else {
4135                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4136                        UTEXT_PREVIOUS32(fInputText);
4137                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4138                    }
4139                }
4140
4141                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4142                    // We have tried all potential match starting points without
4143                    //  getting a match, which means that the negative lookbehind as
4144                    //  a whole has succeeded.  Jump forward to the continue location
4145                    int64_t restoreInputLen = fData[opValue+3];
4146                    U_ASSERT(restoreInputLen >= fActiveLimit);
4147                    U_ASSERT(restoreInputLen <= fInputLength);
4148                    fActiveLimit = restoreInputLen;
4149                    fp->fPatIdx = continueLoc;
4150                    break;
4151                }
4152
4153                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4154                //      (successful match will cause a FAIL out of the loop altogether.)
4155                fp = StateSave(fp, fp->fPatIdx-4, status);
4156                fp->fInputIdx = *lbStartIdx;
4157            }
4158            break;
4159
4160        case URX_LBN_END:
4161            // End of a negative look-behind block, after a successful match.
4162            {
4163                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4164                if (fp->fInputIdx != fActiveLimit) {
4165                    //  The look-behind expression matched, but the match did not
4166                    //    extend all the way to the point that we are looking behind from.
4167                    //  FAIL out of here, which will take us back to the LB_CONT, which
4168                    //     will retry the match starting at another position or succeed
4169                    //     the look-behind altogether, whichever is appropriate.
4170                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4171                    break;
4172                }
4173
4174                // Look-behind expression matched, which means look-behind test as
4175                //   a whole Fails
4176
4177                //   Restore the orignal input string length, which had been truncated
4178                //   inorder to pin the end of the lookbehind match
4179                //   to the position being looked-behind.
4180                int64_t originalInputLen = fData[opValue+3];
4181                U_ASSERT(originalInputLen >= fActiveLimit);
4182                U_ASSERT(originalInputLen <= fInputLength);
4183                fActiveLimit = originalInputLen;
4184
4185                // Restore original stack position, discarding any state saved
4186                //   by the successful pattern match.
4187                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4188                int32_t newStackSize = (int32_t)fData[opValue];
4189                U_ASSERT(fStack->size() > newStackSize);
4190                fStack->setSize(newStackSize);
4191
4192                //  FAIL, which will take control back to someplace
4193                //  prior to entering the look-behind test.
4194                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4195            }
4196            break;
4197
4198
4199        case URX_LOOP_SR_I:
4200            // Loop Initialization for the optimized implementation of
4201            //     [some character set]*
4202            //   This op scans through all matching input.
4203            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4204            {
4205                U_ASSERT(opValue > 0 && opValue < sets->size());
4206                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4207                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4208
4209                // Loop through input, until either the input is exhausted or
4210                //   we reach a character that is not a member of the set.
4211                int64_t ix = fp->fInputIdx;
4212                UTEXT_SETNATIVEINDEX(fInputText, ix);
4213                for (;;) {
4214                    if (ix >= fActiveLimit) {
4215                        fHitEnd = TRUE;
4216                        break;
4217                    }
4218                    UChar32 c = UTEXT_NEXT32(fInputText);
4219                    if (c<256) {
4220                        if (s8->contains(c) == FALSE) {
4221                            break;
4222                        }
4223                    } else {
4224                        if (s->contains(c) == FALSE) {
4225                            break;
4226                        }
4227                    }
4228                    ix = UTEXT_GETNATIVEINDEX(fInputText);
4229                }
4230
4231                // If there were no matching characters, skip over the loop altogether.
4232                //   The loop doesn't run at all, a * op always succeeds.
4233                if (ix == fp->fInputIdx) {
4234                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4235                    break;
4236                }
4237
4238                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4239                //   must follow.  It's operand is the stack location
4240                //   that holds the starting input index for the match of this [set]*
4241                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4242                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4243                int32_t stackLoc = URX_VAL(loopcOp);
4244                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4245                fp->fExtra[stackLoc] = fp->fInputIdx;
4246                #ifdef REGEX_SMART_BACKTRACKING
4247                backSearchIndex = fp->fInputIdx;
4248                #endif
4249                fp->fInputIdx = ix;
4250
4251                // Save State to the URX_LOOP_C op that follows this one,
4252                //   so that match failures in the following code will return to there.
4253                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4254                fp = StateSave(fp, fp->fPatIdx, status);
4255                fp->fPatIdx++;
4256            }
4257            break;
4258
4259
4260        case URX_LOOP_DOT_I:
4261            // Loop Initialization for the optimized implementation of .*
4262            //   This op scans through all remaining input.
4263            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4264            {
4265                // Loop through input until the input is exhausted (we reach an end-of-line)
4266                // In DOTALL mode, we can just go straight to the end of the input.
4267                int64_t ix;
4268                if ((opValue & 1) == 1) {
4269                    // Dot-matches-All mode.  Jump straight to the end of the string.
4270                    ix = fActiveLimit;
4271                    fHitEnd = TRUE;
4272                } else {
4273                    // NOT DOT ALL mode.  Line endings do not match '.'
4274                    // Scan forward until a line ending or end of input.
4275                    ix = fp->fInputIdx;
4276                    UTEXT_SETNATIVEINDEX(fInputText, ix);
4277                    for (;;) {
4278                        if (ix >= fActiveLimit) {
4279                            fHitEnd = TRUE;
4280                            break;
4281                        }
4282                        UChar32 c = UTEXT_NEXT32(fInputText);
4283                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4284                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4285                               (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4286                                    (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4287                                //  char is a line ending.  Exit the scanning loop.
4288                                break;
4289                            }
4290                        }
4291                        ix = UTEXT_GETNATIVEINDEX(fInputText);
4292                    }
4293                }
4294
4295                // If there were no matching characters, skip over the loop altogether.
4296                //   The loop doesn't run at all, a * op always succeeds.
4297                if (ix == fp->fInputIdx) {
4298                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4299                    break;
4300                }
4301
4302                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4303                //   must follow.  It's operand is the stack location
4304                //   that holds the starting input index for the match of this .*
4305                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4306                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4307                int32_t stackLoc = URX_VAL(loopcOp);
4308                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4309                fp->fExtra[stackLoc] = fp->fInputIdx;
4310                #ifdef REGEX_SMART_BACKTRACKING
4311                backSearchIndex = fp->fInputIdx;
4312                #endif
4313                fp->fInputIdx = ix;
4314
4315                // Save State to the URX_LOOP_C op that follows this one,
4316                //   so that match failures in the following code will return to there.
4317                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4318                fp = StateSave(fp, fp->fPatIdx, status);
4319                fp->fPatIdx++;
4320            }
4321            break;
4322
4323
4324        case URX_LOOP_C:
4325            {
4326                U_ASSERT(opValue>=0 && opValue<fFrameSize);
4327                backSearchIndex = fp->fExtra[opValue];
4328                U_ASSERT(backSearchIndex <= fp->fInputIdx);
4329                if (backSearchIndex == fp->fInputIdx) {
4330                    // We've backed up the input idx to the point that the loop started.
4331                    // The loop is done.  Leave here without saving state.
4332                    //  Subsequent failures won't come back here.
4333                    break;
4334                }
4335                // Set up for the next iteration of the loop, with input index
4336                //   backed up by one from the last time through,
4337                //   and a state save to this instruction in case the following code fails again.
4338                //   (We're going backwards because this loop emulates stack unwinding, not
4339                //    the initial scan forward.)
4340                U_ASSERT(fp->fInputIdx > 0);
4341                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4342                UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4343                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4344
4345                UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4346                if (prevC == 0x0a &&
4347                    fp->fInputIdx > backSearchIndex &&
4348                    twoPrevC == 0x0d) {
4349                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4350                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4351                        // .*, stepping back over CRLF pair.
4352                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4353                    }
4354                }
4355
4356
4357                fp = StateSave(fp, fp->fPatIdx-1, status);
4358            }
4359            break;
4360
4361
4362
4363        default:
4364            // Trouble.  The compiled pattern contains an entry with an
4365            //           unrecognized type tag.
4366            U_ASSERT(FALSE);
4367        }
4368
4369        if (U_FAILURE(status)) {
4370            isMatch = FALSE;
4371            break;
4372        }
4373    }
4374
4375breakFromLoop:
4376    fMatch = isMatch;
4377    if (isMatch) {
4378        fLastMatchEnd = fMatchEnd;
4379        fMatchStart   = startIdx;
4380        fMatchEnd     = fp->fInputIdx;
4381        if (fTraceDebug) {
4382            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
4383        }
4384    }
4385    else
4386    {
4387        if (fTraceDebug) {
4388            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4389        }
4390    }
4391
4392    fFrame = fp;                // The active stack frame when the engine stopped.
4393                                //   Contains the capture group results that we need to
4394                                //    access later.
4395    return;
4396}
4397
4398
4399//--------------------------------------------------------------------------------
4400//
4401//   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4402//                  assumption that the entire string is available in the UText's
4403//                  chunk buffer. For now, that means we can use int32_t indexes,
4404//                  except for anything that needs to be saved (like group starts
4405//                  and ends).
4406//
4407//                  startIdx:    begin matching a this index.
4408//                  toEnd:       if true, match must extend to end of the input region
4409//
4410//--------------------------------------------------------------------------------
4411void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4412    UBool       isMatch  = FALSE;      // True if the we have a match.
4413
4414    int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4415
4416    int32_t     op;                    // Operation from the compiled pattern, split into
4417    int32_t     opType;                //    the opcode
4418    int32_t     opValue;               //    and the operand value.
4419
4420#ifdef REGEX_RUN_DEBUG
4421    if (fTraceDebug)
4422    {
4423        printf("MatchAt(startIdx=%ld)\n", startIdx);
4424        printf("Original Pattern: ");
4425        UChar32 c = utext_next32From(fPattern->fPattern, 0);
4426        while (c != U_SENTINEL) {
4427            if (c<32 || c>256) {
4428                c = '.';
4429            }
4430            REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4431
4432            c = UTEXT_NEXT32(fPattern->fPattern);
4433        }
4434        printf("\n");
4435        printf("Input String: ");
4436        c = utext_next32From(fInputText, 0);
4437        while (c != U_SENTINEL) {
4438            if (c<32 || c>256) {
4439                c = '.';
4440            }
4441            printf("%c", c);
4442
4443            c = UTEXT_NEXT32(fInputText);
4444        }
4445        printf("\n");
4446        printf("\n");
4447    }
4448#endif
4449
4450    if (U_FAILURE(status)) {
4451        return;
4452    }
4453
4454    //  Cache frequently referenced items from the compiled pattern
4455    //
4456    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4457
4458    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4459    UVector             *sets          = fPattern->fSets;
4460
4461    const UChar         *inputBuf      = fInputText->chunkContents;
4462
4463    fFrameSize = fPattern->fFrameSize;
4464    REStackFrame        *fp            = resetStack();
4465
4466    fp->fPatIdx   = 0;
4467    fp->fInputIdx = startIdx;
4468
4469    // Zero out the pattern's static data
4470    int32_t i;
4471    for (i = 0; i<fPattern->fDataSize; i++) {
4472        fData[i] = 0;
4473    }
4474
4475    //
4476    //  Main loop for interpreting the compiled pattern.
4477    //  One iteration of the loop per pattern operation performed.
4478    //
4479    for (;;) {
4480#if 0
4481        if (_heapchk() != _HEAPOK) {
4482            fprintf(stderr, "Heap Trouble\n");
4483        }
4484#endif
4485
4486        op      = (int32_t)pat[fp->fPatIdx];
4487        opType  = URX_TYPE(op);
4488        opValue = URX_VAL(op);
4489#ifdef REGEX_RUN_DEBUG
4490        if (fTraceDebug) {
4491            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4492            printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
4493                   UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4494            fPattern->dumpOp(fp->fPatIdx);
4495        }
4496#endif
4497        fp->fPatIdx++;
4498
4499        switch (opType) {
4500
4501
4502        case URX_NOP:
4503            break;
4504
4505
4506        case URX_BACKTRACK:
4507            // Force a backtrack.  In some circumstances, the pattern compiler
4508            //   will notice that the pattern can't possibly match anything, and will
4509            //   emit one of these at that point.
4510            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4511            break;
4512
4513
4514        case URX_ONECHAR:
4515            if (fp->fInputIdx < fActiveLimit) {
4516                UChar32 c;
4517                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4518                if (c == opValue) {
4519                    break;
4520                }
4521            } else {
4522                fHitEnd = TRUE;
4523            }
4524
4525            #ifdef REGEX_SMART_BACKTRACKING
4526            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
4527                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4528                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4529                    int64_t reverseIndex = fp->fInputIdx;
4530                    UChar32 c;
4531                    do {
4532                        U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4533                        if (c == opValue) {
4534                            break;
4535                        }
4536                    } while (reverseIndex > backSearchIndex);
4537                    if (c == opValue) {
4538                        fHitEnd = FALSE;
4539                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4540                        fp->fInputIdx = reverseIndex;
4541                        if (fp->fInputIdx > backSearchIndex) {
4542                            fp = StateSave(fp, fp->fPatIdx, status);
4543                        }
4544                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
4545                        break;
4546                    }
4547                }
4548            }
4549            #endif
4550
4551            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4552            break;
4553
4554
4555        case URX_STRING:
4556            {
4557                // Test input against a literal string.
4558                // Strings require two slots in the compiled pattern, one for the
4559                //   offset to the string text, and one for the length.
4560                int32_t   stringStartIdx = opValue;
4561                int32_t   stringLen;
4562
4563                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4564                fp->fPatIdx++;
4565                opType    = URX_TYPE(op);
4566                stringLen = URX_VAL(op);
4567                U_ASSERT(opType == URX_STRING_LEN);
4568                U_ASSERT(stringLen >= 2);
4569
4570                if (fp->fInputIdx + stringLen > fActiveLimit) {
4571                    // No match.  String is longer than the remaining input text.
4572                    fHitEnd = TRUE;          //   TODO:  See ticket 6074
4573                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4574                    break;
4575                }
4576
4577                const UChar * pInp = inputBuf + fp->fInputIdx;
4578                const UChar * pPat = litText+stringStartIdx;
4579                const UChar * pEnd = pInp + stringLen;
4580                UBool success = FALSE;
4581                for(;;) {
4582                    if (*pInp == *pPat) {
4583                        pInp++;
4584                        pPat++;
4585                        if (pInp == pEnd) {
4586                            // Successful Match.
4587                            success = TRUE;
4588                            break;
4589                        }
4590                    } else {
4591                        // Match failed.
4592                        break;
4593                    }
4594                }
4595
4596                if (success) {
4597                    fp->fInputIdx += stringLen;
4598                } else {
4599                    #ifdef REGEX_SMART_BACKTRACKING
4600                    if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4601                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4602                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4603                            // Reset to last start point
4604                            int64_t reverseIndex = fp->fInputIdx;
4605                            UChar32 c;
4606                            pPat = litText+stringStartIdx;
4607
4608                            // Search backwards for a possible start
4609                            do {
4610                                U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4611                                if ((U_IS_BMP(c) && *pPat == c) ||
4612                                    (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
4613                                    success = TRUE;
4614                                    break;
4615                                }
4616                            } while (reverseIndex > backSearchIndex);
4617
4618                            // And try again
4619                            if (success) {
4620                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4621                                fp->fInputIdx = reverseIndex;
4622                                if (fp->fInputIdx > backSearchIndex) {
4623                                    fp = StateSave(fp, fp->fPatIdx, status);
4624                                }
4625                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
4626                                break;
4627                            }
4628                        }
4629                    }
4630                    #endif
4631                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4632                }
4633            }
4634            break;
4635
4636
4637        case URX_STATE_SAVE:
4638            fp = StateSave(fp, opValue, status);
4639            break;
4640
4641
4642        case URX_END:
4643            // The match loop will exit via this path on a successful match,
4644            //   when we reach the end of the pattern.
4645            if (toEnd && fp->fInputIdx != fActiveLimit) {
4646                // The pattern matched, but not to the end of input.  Try some more.
4647                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4648                break;
4649            }
4650            isMatch = TRUE;
4651            goto  breakFromLoop;
4652
4653            // Start and End Capture stack frame variables are laid out out like this:
4654            //  fp->fExtra[opValue]  - The start of a completed capture group
4655            //             opValue+1 - The end   of a completed capture group
4656            //             opValue+2 - the start of a capture group whose end
4657            //                          has not yet been reached (and might not ever be).
4658        case URX_START_CAPTURE:
4659            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4660            fp->fExtra[opValue+2] = fp->fInputIdx;
4661            break;
4662
4663
4664        case URX_END_CAPTURE:
4665            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4666            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4667            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4668            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4669            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4670            break;
4671
4672
4673        case URX_DOLLAR:                   //  $, test for End of line
4674            //     or for position before new line at end of input
4675            if (fp->fInputIdx < fAnchorLimit-2) {
4676                // We are no where near the end of input.  Fail.
4677                //   This is the common case.  Keep it first.
4678                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4679                break;
4680            }
4681            if (fp->fInputIdx >= fAnchorLimit) {
4682                // We really are at the end of input.  Success.
4683                fHitEnd = TRUE;
4684                fRequireEnd = TRUE;
4685                break;
4686            }
4687
4688            // If we are positioned just before a new-line that is located at the
4689            //   end of input, succeed.
4690            if (fp->fInputIdx == fAnchorLimit-1) {
4691                UChar32 c;
4692                U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4693
4694                if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4695                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4696                        // At new-line at end of input. Success
4697                        fHitEnd = TRUE;
4698                        fRequireEnd = TRUE;
4699                        break;
4700                    }
4701                }
4702            } else if (fp->fInputIdx == fAnchorLimit-2 &&
4703                inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4704                    fHitEnd = TRUE;
4705                    fRequireEnd = TRUE;
4706                    break;                         // At CR/LF at end of input.  Success
4707            }
4708
4709            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4710
4711            break;
4712
4713
4714        case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4715            if (fp->fInputIdx >= fAnchorLimit-1) {
4716                // Either at the last character of input, or off the end.
4717                if (fp->fInputIdx == fAnchorLimit-1) {
4718                    // At last char of input.  Success if it's a new line.
4719                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4720                        fHitEnd = TRUE;
4721                        fRequireEnd = TRUE;
4722                        break;
4723                    }
4724                } else {
4725                    // Off the end of input.  Success.
4726                    fHitEnd = TRUE;
4727                    fRequireEnd = TRUE;
4728                    break;
4729                }
4730            }
4731
4732            // Not at end of input.  Back-track out.
4733            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4734            break;
4735
4736
4737        case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4738            {
4739                if (fp->fInputIdx >= fAnchorLimit) {
4740                    // We really are at the end of input.  Success.
4741                    fHitEnd = TRUE;
4742                    fRequireEnd = TRUE;
4743                    break;
4744                }
4745                // If we are positioned just before a new-line, succeed.
4746                // It makes no difference where the new-line is within the input.
4747                UChar32 c = inputBuf[fp->fInputIdx];
4748                if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4749                    // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4750                    //  In multi-line mode, hitting a new-line just before the end of input does not
4751                    //   set the hitEnd or requireEnd flags
4752                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4753                        break;
4754                    }
4755                }
4756                // not at a new line.  Fail.
4757                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4758            }
4759            break;
4760
4761
4762        case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4763            {
4764                if (fp->fInputIdx >= fAnchorLimit) {
4765                    // We really are at the end of input.  Success.
4766                    fHitEnd = TRUE;
4767                    fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4768                    break;               //   adding a new-line would not lose the match.
4769                }
4770                // If we are not positioned just before a new-line, the test fails; backtrack out.
4771                // It makes no difference where the new-line is within the input.
4772                if (inputBuf[fp->fInputIdx] != 0x0a) {
4773                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4774                }
4775            }
4776            break;
4777
4778
4779        case URX_CARET:                    //  ^, test for start of line
4780            if (fp->fInputIdx != fAnchorStart) {
4781                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4782            }
4783            break;
4784
4785
4786        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4787            {
4788                if (fp->fInputIdx == fAnchorStart) {
4789                    // We are at the start input.  Success.
4790                    break;
4791                }
4792                // Check whether character just before the current pos is a new-line
4793                //   unless we are at the end of input
4794                UChar  c = inputBuf[fp->fInputIdx - 1];
4795                if ((fp->fInputIdx < fAnchorLimit) &&
4796                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4797                    //  It's a new-line.  ^ is true.  Success.
4798                    //  TODO:  what should be done with positions between a CR and LF?
4799                    break;
4800                }
4801                // Not at the start of a line.  Fail.
4802                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4803            }
4804            break;
4805
4806
4807        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4808            {
4809                U_ASSERT(fp->fInputIdx >= fAnchorStart);
4810                if (fp->fInputIdx <= fAnchorStart) {
4811                    // We are at the start input.  Success.
4812                    break;
4813                }
4814                // Check whether character just before the current pos is a new-line
4815                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4816                UChar  c = inputBuf[fp->fInputIdx - 1];
4817                if (c != 0x0a) {
4818                    // Not at the start of a line.  Back-track out.
4819                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4820                }
4821            }
4822            break;
4823
4824        case URX_BACKSLASH_B:          // Test for word boundaries
4825            {
4826                UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4827                success ^= (opValue != 0);     // flip sense for \B
4828                if (!success) {
4829                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4830                }
4831            }
4832            break;
4833
4834
4835        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4836            {
4837                UBool success = isUWordBoundary(fp->fInputIdx);
4838                success ^= (opValue != 0);     // flip sense for \B
4839                if (!success) {
4840                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4841                }
4842            }
4843            break;
4844
4845
4846        case URX_BACKSLASH_D:            // Test for decimal digit
4847            {
4848                if (fp->fInputIdx >= fActiveLimit) {
4849                    fHitEnd = TRUE;
4850                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4851                    break;
4852                }
4853
4854                UChar32 c;
4855                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4856                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4857                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4858                success ^= (opValue != 0);        // flip sense for \D
4859                if (!success) {
4860                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4861                }
4862            }
4863            break;
4864
4865
4866        case URX_BACKSLASH_G:          // Test for position at end of previous match
4867            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4868                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4869            }
4870            break;
4871
4872
4873        case URX_BACKSLASH_X:
4874        //  Match a Grapheme, as defined by Unicode TR 29.
4875        //  Differs slightly from Perl, which consumes combining marks independently
4876        //    of context.
4877        {
4878
4879            // Fail if at end of input
4880            if (fp->fInputIdx >= fActiveLimit) {
4881                fHitEnd = TRUE;
4882                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4883                break;
4884            }
4885
4886            // Examine (and consume) the current char.
4887            //   Dispatch into a little state machine, based on the char.
4888            UChar32  c;
4889            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4890            UnicodeSet **sets = fPattern->fStaticSets;
4891            if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4892            if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4893            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4894            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4895            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4896            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4897            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4898            goto GC_Extend;
4899
4900
4901
4902GC_L:
4903            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4904            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4905            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4906            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4907            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4908            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4909            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4910            goto GC_Extend;
4911
4912GC_V:
4913            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4914            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4915            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4916            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4917            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4918            goto GC_Extend;
4919
4920GC_T:
4921            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4922            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4923            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4924            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4925            goto GC_Extend;
4926
4927GC_Extend:
4928            // Combining characters are consumed here
4929            for (;;) {
4930                if (fp->fInputIdx >= fActiveLimit) {
4931                    break;
4932                }
4933                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4934                if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4935                    U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4936                    break;
4937                }
4938            }
4939            goto GC_Done;
4940
4941GC_Control:
4942            // Most control chars stand alone (don't combine with combining chars),
4943            //   except for that CR/LF sequence is a single grapheme cluster.
4944            if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4945                fp->fInputIdx++;
4946            }
4947
4948GC_Done:
4949            if (fp->fInputIdx >= fActiveLimit) {
4950                fHitEnd = TRUE;
4951            }
4952            break;
4953        }
4954
4955
4956
4957
4958        case URX_BACKSLASH_Z:          // Test for end of Input
4959            if (fp->fInputIdx < fAnchorLimit) {
4960                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4961            } else {
4962                fHitEnd = TRUE;
4963                fRequireEnd = TRUE;
4964            }
4965            break;
4966
4967
4968
4969        case URX_STATIC_SETREF:
4970            {
4971                // Test input character against one of the predefined sets
4972                //    (Word Characters, for example)
4973                // The high bit of the op value is a flag for the match polarity.
4974                //    0:   success if input char is in set.
4975                //    1:   success if input char is not in set.
4976                if (fp->fInputIdx >= fActiveLimit) {
4977                    fHitEnd = TRUE;
4978                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4979                    break;
4980                }
4981
4982                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4983                opValue &= ~URX_NEG_SET;
4984                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4985
4986                UChar32 c;
4987                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4988                if (c < 256) {
4989                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4990                    if (s8->contains(c)) {
4991                        success = !success;
4992                    }
4993                } else {
4994                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
4995                    if (s->contains(c)) {
4996                        success = !success;
4997                    }
4998                }
4999                if (!success) {
5000                    #ifdef REGEX_SMART_BACKTRACKING
5001                    if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5002                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5003                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5004                            // Try to find it, backwards
5005                            int64_t reverseIndex = fp->fInputIdx;
5006                            U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5007                            success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
5008                            do {
5009                                U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5010                                if (c < 256) {
5011                                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5012                                    if (s8->contains(c)) {
5013                                        success = !success;
5014                                    }
5015                                } else {
5016                                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
5017                                    if (s->contains(c)) {
5018                                        success = !success;
5019                                    }
5020                                }
5021                            } while (reverseIndex > backSearchIndex && !success);
5022
5023                            if (success) {
5024                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5025                                fp->fInputIdx = reverseIndex;
5026                                if (fp->fInputIdx > backSearchIndex) {
5027                                    fp = StateSave(fp, fp->fPatIdx, status);
5028                                }
5029                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
5030                                break;
5031                            }
5032                        }
5033                    }
5034                    #endif
5035                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5036                }
5037            }
5038            break;
5039
5040
5041        case URX_STAT_SETREF_N:
5042            {
5043                // Test input character for NOT being a member of  one of
5044                //    the predefined sets (Word Characters, for example)
5045                if (fp->fInputIdx >= fActiveLimit) {
5046                    fHitEnd = TRUE;
5047                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5048                    break;
5049                }
5050
5051                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5052
5053                UChar32  c;
5054                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5055                if (c < 256) {
5056                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5057                    if (s8->contains(c) == FALSE) {
5058                        break;
5059                    }
5060                } else {
5061                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
5062                    if (s->contains(c) == FALSE) {
5063                        break;
5064                    }
5065                }
5066
5067                #ifdef REGEX_SMART_BACKTRACKING
5068                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5069                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5070                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5071                        // Try to find it, backwards
5072                        int64_t reverseIndex = fp->fInputIdx;
5073                        U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5074                        UBool success = FALSE;
5075                        do {
5076                            U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5077                            if (c < 256) {
5078                                Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5079                                if (s8->contains(c) == FALSE) {
5080                                    success = TRUE;
5081                                    break;
5082                                }
5083                            } else {
5084                                const UnicodeSet *s = fPattern->fStaticSets[opValue];
5085                                if (s->contains(c) == FALSE) {
5086                                    success = TRUE;
5087                                    break;
5088                                }
5089                            }
5090                        } while (reverseIndex > backSearchIndex);
5091
5092                        if (success) {
5093                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5094                            fp->fInputIdx = reverseIndex;
5095                            if (fp->fInputIdx > backSearchIndex) {
5096                                fp = StateSave(fp, fp->fPatIdx, status);
5097                            }
5098                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
5099                            break;
5100                        }
5101                    }
5102                }
5103                #endif
5104                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5105            }
5106            break;
5107
5108
5109        case URX_SETREF:
5110            {
5111                if (fp->fInputIdx >= fActiveLimit) {
5112                    fHitEnd = TRUE;
5113                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5114                    break;
5115                }
5116
5117                U_ASSERT(opValue > 0 && opValue < sets->size());
5118
5119                // There is input left.  Pick up one char and test it for set membership.
5120                UChar32  c;
5121                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5122                if (c<256) {
5123                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5124                    if (s8->contains(c)) {
5125                        // The character is in the set.  A Match.
5126                        break;
5127                    }
5128                } else {
5129                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5130                    if (s->contains(c)) {
5131                        // The character is in the set.  A Match.
5132                        break;
5133                    }
5134                }
5135
5136                // the character wasn't in the set.
5137                #ifdef REGEX_SMART_BACKTRACKING
5138                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5139                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5140                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5141                        // Try to find it, backwards
5142                        int64_t reverseIndex = fp->fInputIdx;
5143                        U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5144                        UBool success = FALSE;
5145                        do {
5146                            U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5147                            if (c < 256) {
5148                                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5149                                if (s8->contains(c)) {
5150                                    success = TRUE;
5151                                    break;
5152                                }
5153                            } else {
5154                                UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5155                                if (s->contains(c)) {
5156                                    success = TRUE;
5157                                    break;
5158                                }
5159                            }
5160                        } while (reverseIndex > backSearchIndex);
5161
5162                        if (success) {
5163                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5164                            fp->fInputIdx = reverseIndex;
5165                            if (fp->fInputIdx > reverseIndex) {
5166                                fp = StateSave(fp, fp->fPatIdx, status);
5167                            }
5168                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
5169                            break;
5170                        }
5171                    }
5172                }
5173                #endif
5174                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5175            }
5176            break;
5177
5178
5179        case URX_DOTANY:
5180            {
5181                // . matches anything, but stops at end-of-line.
5182                if (fp->fInputIdx >= fActiveLimit) {
5183                    // At end of input.  Match failed.  Backtrack out.
5184                    fHitEnd = TRUE;
5185                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5186                    break;
5187                }
5188
5189                // There is input left.  Advance over one char, unless we've hit end-of-line
5190                UChar32  c;
5191                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5192                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
5193                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
5194                    // End of line in normal mode.   . does not match.
5195                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5196                    break;
5197                }
5198            }
5199            break;
5200
5201
5202        case URX_DOTANY_ALL:
5203            {
5204                // . in dot-matches-all (including new lines) mode
5205                if (fp->fInputIdx >= fActiveLimit) {
5206                    // At end of input.  Match failed.  Backtrack out.
5207                    fHitEnd = TRUE;
5208                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5209                    break;
5210                }
5211
5212                // There is input left.  Advance over one char, except if we are
5213                //   at a cr/lf, advance over both of them.
5214                UChar32 c;
5215                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5216                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
5217                    // In the case of a CR/LF, we need to advance over both.
5218                    if (inputBuf[fp->fInputIdx] == 0x0a) {
5219                        U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5220                    }
5221                }
5222            }
5223            break;
5224
5225
5226        case URX_DOTANY_UNIX:
5227            {
5228                // '.' operator, matches all, but stops at end-of-line.
5229                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
5230                if (fp->fInputIdx >= fActiveLimit) {
5231                    // At end of input.  Match failed.  Backtrack out.
5232                    fHitEnd = TRUE;
5233                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5234                    break;
5235                }
5236
5237                // There is input left.  Advance over one char, unless we've hit end-of-line
5238                UChar32 c;
5239                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5240                if (c == 0x0a) {
5241                    // End of line in normal mode.   '.' does not match the \n
5242                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5243                }
5244            }
5245            break;
5246
5247
5248        case URX_JMP:
5249            fp->fPatIdx = opValue;
5250            break;
5251
5252        case URX_FAIL:
5253            isMatch = FALSE;
5254            goto breakFromLoop;
5255
5256        case URX_JMP_SAV:
5257            U_ASSERT(opValue < fPattern->fCompiledPat->size());
5258            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
5259            fp->fPatIdx = opValue;                         // Then JMP.
5260            break;
5261
5262        case URX_JMP_SAV_X:
5263            // This opcode is used with (x)+, when x can match a zero length string.
5264            // Same as JMP_SAV, except conditional on the match having made forward progress.
5265            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5266            //   data address of the input position at the start of the loop.
5267            {
5268                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5269                int32_t  stoOp = (int32_t)pat[opValue-1];
5270                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5271                int32_t  frameLoc = URX_VAL(stoOp);
5272                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5273                int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5274                U_ASSERT(prevInputIdx <= fp->fInputIdx);
5275                if (prevInputIdx < fp->fInputIdx) {
5276                    // The match did make progress.  Repeat the loop.
5277                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
5278                    fp->fPatIdx = opValue;
5279                    fp->fExtra[frameLoc] = fp->fInputIdx;
5280                }
5281                // If the input position did not advance, we do nothing here,
5282                //   execution will fall out of the loop.
5283            }
5284            break;
5285
5286        case URX_CTR_INIT:
5287            {
5288                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5289                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5290
5291                // Pick up the three extra operands that CTR_INIT has, and
5292                //    skip the pattern location counter past
5293                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5294                fp->fPatIdx += 3;
5295                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5296                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5297                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5298                U_ASSERT(minCount>=0);
5299                U_ASSERT(maxCount>=minCount || maxCount==-1);
5300                U_ASSERT(loopLoc>fp->fPatIdx);
5301
5302                if (minCount == 0) {
5303                    fp = StateSave(fp, loopLoc+1, status);
5304                }
5305                if (maxCount == 0) {
5306                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5307                }
5308            }
5309            break;
5310
5311        case URX_CTR_LOOP:
5312            {
5313                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5314                int32_t initOp = (int32_t)pat[opValue];
5315                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5316                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5317                int32_t minCount  = (int32_t)pat[opValue+2];
5318                int32_t maxCount  = (int32_t)pat[opValue+3];
5319                // Increment the counter.  Note: we DIDN'T worry about counter
5320                //   overflow, since the data comes from UnicodeStrings, which
5321                //   stores its length in an int32_t. Do we have to think about
5322                //   this now that we're using UText? Probably not, since the length
5323                //   in UChar32s is still an int32_t.
5324                (*pCounter)++;
5325                U_ASSERT(*pCounter > 0);
5326                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5327                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
5328                    break;
5329                }
5330                if (*pCounter >= minCount) {
5331                    fp = StateSave(fp, fp->fPatIdx, status);
5332                }
5333                fp->fPatIdx = opValue + 4;    // Loop back.
5334            }
5335            break;
5336
5337        case URX_CTR_INIT_NG:
5338            {
5339                // Initialize a non-greedy loop
5340                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5341                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5342
5343                // Pick up the three extra operands that CTR_INIT has, and
5344                //    skip the pattern location counter past
5345                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5346                fp->fPatIdx += 3;
5347                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5348                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5349                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5350                U_ASSERT(minCount>=0);
5351                U_ASSERT(maxCount>=minCount || maxCount==-1);
5352                U_ASSERT(loopLoc>fp->fPatIdx);
5353
5354                if (minCount == 0) {
5355                    if (maxCount != 0) {
5356                        fp = StateSave(fp, fp->fPatIdx, status);
5357                    }
5358                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5359                }
5360            }
5361            break;
5362
5363        case URX_CTR_LOOP_NG:
5364            {
5365                // Non-greedy {min, max} loops
5366                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5367                int32_t initOp = (int32_t)pat[opValue];
5368                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5369                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5370                int32_t minCount  = (int32_t)pat[opValue+2];
5371                int32_t maxCount  = (int32_t)pat[opValue+3];
5372                // Increment the counter.  Note: we DIDN'T worry about counter
5373                //   overflow, since the data comes from UnicodeStrings, which
5374                //   stores its length in an int32_t. Do we have to think about
5375                //   this now that we're using UText? Probably not, since the length
5376                //   in UChar32s is still an int32_t.
5377                (*pCounter)++;
5378                U_ASSERT(*pCounter > 0);
5379
5380                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5381                    // The loop has matched the maximum permitted number of times.
5382                    //   Break out of here with no action.  Matching will
5383                    //   continue with the following pattern.
5384                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
5385                    break;
5386                }
5387
5388                if (*pCounter < minCount) {
5389                    // We haven't met the minimum number of matches yet.
5390                    //   Loop back for another one.
5391                    fp->fPatIdx = opValue + 4;    // Loop back.
5392                } else {
5393                    // We do have the minimum number of matches.
5394                    //   Fall into the following pattern, but first do
5395                    //   a state save to the top of the loop, so that a failure
5396                    //   in the following pattern will try another iteration of the loop.
5397                    fp = StateSave(fp, opValue + 4, status);
5398                }
5399            }
5400            break;
5401
5402        case URX_STO_SP:
5403            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5404            fData[opValue] = fStack->size();
5405            break;
5406
5407        case URX_LD_SP:
5408            {
5409                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5410                int32_t newStackSize = (int32_t)fData[opValue];
5411                U_ASSERT(newStackSize <= fStack->size());
5412                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5413                if (newFP == (int64_t *)fp) {
5414                    break;
5415                }
5416                int32_t i;
5417                for (i=0; i<fFrameSize; i++) {
5418                    newFP[i] = ((int64_t *)fp)[i];
5419                }
5420                fp = (REStackFrame *)newFP;
5421                fStack->setSize(newStackSize);
5422            }
5423            break;
5424
5425        case URX_BACKREF:
5426        case URX_BACKREF_I:
5427            {
5428                U_ASSERT(opValue < fFrameSize);
5429                int64_t groupStartIdx = fp->fExtra[opValue];
5430                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5431                U_ASSERT(groupStartIdx <= groupEndIdx);
5432                int64_t len = groupEndIdx-groupStartIdx;
5433                if (groupStartIdx < 0) {
5434                    // This capture group has not participated in the match thus far,
5435                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5436                }
5437
5438                if (len == 0) {
5439                        //   The capture group match was of an empty string.
5440                        //   Verified by testing:  Perl matches succeed in this case, so
5441                        //   we do too.
5442                        break;
5443                    }
5444
5445                UBool  haveMatch = FALSE;
5446                if (fp->fInputIdx + len <= fActiveLimit) {
5447                    if (opType == URX_BACKREF) {
5448                        if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
5449                            haveMatch = TRUE;
5450                        }
5451                    } else {
5452                        if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
5453                                  (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
5454                            haveMatch = TRUE;
5455                        }
5456                    }
5457                } else {
5458                    // TODO: probably need to do a partial string comparison, and only
5459                    //       set HitEnd if the available input matched.  Ticket #6074
5460                    fHitEnd = TRUE;
5461                }
5462                if (haveMatch) {
5463                    fp->fInputIdx += len;     // Match.  Advance current input position.
5464                } else {
5465                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5466                }
5467            }
5468            break;
5469
5470        case URX_STO_INP_LOC:
5471            {
5472                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5473                fp->fExtra[opValue] = fp->fInputIdx;
5474            }
5475            break;
5476
5477        case URX_JMPX:
5478            {
5479                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5480                fp->fPatIdx += 1;
5481                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5482                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5483                int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5484                U_ASSERT(savedInputIdx <= fp->fInputIdx);
5485                if (savedInputIdx < fp->fInputIdx) {
5486                    fp->fPatIdx = opValue;                               // JMP
5487                } else {
5488                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5489                }
5490            }
5491            break;
5492
5493        case URX_LA_START:
5494            {
5495                // Entering a lookahead block.
5496                // Save Stack Ptr, Input Pos.
5497                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5498                fData[opValue]   = fStack->size();
5499                fData[opValue+1] = fp->fInputIdx;
5500                fActiveStart     = fLookStart;          // Set the match region change for
5501                fActiveLimit     = fLookLimit;          //   transparent bounds.
5502            }
5503            break;
5504
5505        case URX_LA_END:
5506            {
5507                // Leaving a look-ahead block.
5508                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5509                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5510                int32_t stackSize = fStack->size();
5511                int32_t newStackSize = (int32_t)fData[opValue];
5512                U_ASSERT(stackSize >= newStackSize);
5513                if (stackSize > newStackSize) {
5514                    // Copy the current top frame back to the new (cut back) top frame.
5515                    //   This makes the capture groups from within the look-ahead
5516                    //   expression available.
5517                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5518                    int32_t i;
5519                    for (i=0; i<fFrameSize; i++) {
5520                        newFP[i] = ((int64_t *)fp)[i];
5521                    }
5522                    fp = (REStackFrame *)newFP;
5523                    fStack->setSize(newStackSize);
5524                }
5525                fp->fInputIdx = fData[opValue+1];
5526
5527                // Restore the active region bounds in the input string; they may have
5528                //    been changed because of transparent bounds on a Region.
5529                fActiveStart = fRegionStart;
5530                fActiveLimit = fRegionLimit;
5531            }
5532            break;
5533
5534        case URX_ONECHAR_I:
5535            if (fp->fInputIdx < fActiveLimit) {
5536                UChar32 c;
5537                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5538                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5539                    break;
5540                }
5541            } else {
5542                fHitEnd = TRUE;
5543            }
5544
5545            #ifdef REGEX_SMART_BACKTRACKING
5546            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5547                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5548                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5549                    UBool success = FALSE;
5550                    int64_t reverseIndex = fp->fInputIdx;
5551                    UChar32 c;
5552                    while (reverseIndex > backSearchIndex) {
5553                        U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5554                        if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5555                            success = TRUE;
5556                            break;
5557                        } else if (c == U_SENTINEL) {
5558                            break;
5559                        }
5560                    }
5561                    if (success) {
5562                        fHitEnd = FALSE;
5563                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5564                        fp->fInputIdx = reverseIndex;
5565                        if (fp->fInputIdx > backSearchIndex) {
5566                            fp = StateSave(fp, fp->fPatIdx, status);
5567                        }
5568                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
5569                        break;
5570                    }
5571                }
5572            }
5573            #endif
5574
5575            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5576            break;
5577
5578        case URX_STRING_I:
5579            {
5580                // Test input against a literal string.
5581                // Strings require two slots in the compiled pattern, one for the
5582                //   offset to the string text, and one for the length.
5583                const UCaseProps *csp = ucase_getSingleton();
5584                {
5585                    int32_t stringStartIdx, stringLen;
5586                    stringStartIdx = opValue;
5587
5588                    op      = (int32_t)pat[fp->fPatIdx];
5589                    fp->fPatIdx++;
5590                    opType  = URX_TYPE(op);
5591                    opValue = URX_VAL(op);
5592                    U_ASSERT(opType == URX_STRING_LEN);
5593                    stringLen = opValue;
5594
5595                    const UChar *patternChars = litText+stringStartIdx;
5596                    const UChar *patternEnd = patternChars+stringLen;
5597
5598                    const UChar *foldChars = NULL;
5599                    int32_t foldOffset, foldLength;
5600                    UChar32 c;
5601                    UBool c_is_valid = FALSE;
5602
5603                    #ifdef REGEX_SMART_BACKTRACKING
5604                    int32_t originalInputIdx = fp->fInputIdx;
5605                    #endif
5606                    UBool success = TRUE;
5607
5608                    foldOffset = foldLength = 0;
5609
5610                    while (patternChars < patternEnd && success) {
5611                        if (fp->fInputIdx < fActiveLimit) {  // don't read past end of string
5612                            if(foldOffset < foldLength) {
5613                                U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5614                                c_is_valid = TRUE;
5615                            } else {
5616                                // test pre-condition of U16_NEXT: i < length
5617                                U_ASSERT(fp->fInputIdx < fActiveLimit);
5618                                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5619                                c_is_valid = TRUE;
5620                                foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5621                                if(foldLength >= 0) {
5622                                    if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5623                                        foldOffset = 0;
5624                                        U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5625                                    } else {
5626                                        c = foldLength;
5627                                        foldLength = foldOffset; // to avoid reading chars from the folding buffer
5628                                    }
5629                                }
5630                            }
5631                        } else {
5632                          c_is_valid = FALSE;
5633                        }
5634
5635                        if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
5636                            if (U_IS_BMP(c)) {
5637                                success = (*patternChars == c);
5638                                patternChars += 1;
5639                            } else if (patternChars+1 < patternEnd) {
5640                                success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
5641                                patternChars += 2;
5642                            }
5643                        } else {
5644                            success = FALSE;
5645                            fHitEnd = TRUE;          //   TODO:  See ticket 6074
5646                        }
5647                    }
5648
5649                    if (!success) {
5650                        #ifdef REGEX_SMART_BACKTRACKING
5651                        if (fp->fInputIdx > backSearchIndex && fStack->size()) {
5652                            REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5653                            if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5654                                // Reset to last start point
5655                                int64_t reverseIndex = originalInputIdx;
5656                                patternChars = litText+stringStartIdx;
5657
5658                                // Search backwards for a possible start
5659                                do {
5660                                    U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5661                                    foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5662                                    if(foldLength >= 0) {
5663                                        if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5664                                            foldOffset = 0;
5665                                            U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5666                                        } else {
5667                                            c = foldLength;
5668                                            foldLength = foldOffset; // to avoid reading chars from the folding buffer
5669                                        }
5670                                    }
5671
5672                                    if ((U_IS_BMP(c) && *patternChars == c) ||
5673                                           (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
5674                                        success = TRUE;
5675                                        break;
5676                                    }
5677                                } while (reverseIndex > backSearchIndex);
5678
5679                                // And try again
5680                                if (success) {
5681                                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5682                                    fp->fInputIdx = reverseIndex;
5683                                    if (fp->fInputIdx > backSearchIndex) {
5684                                        fp = StateSave(fp, fp->fPatIdx, status);
5685                                    }
5686                                    fp->fPatIdx++; // Skip the LOOP_C, we just did that
5687                                    break;
5688                                }
5689                            }
5690                        }
5691                        #endif
5692                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5693                    }
5694                }
5695            }
5696            break;
5697
5698        case URX_LB_START:
5699            {
5700                // Entering a look-behind block.
5701                // Save Stack Ptr, Input Pos.
5702                //   TODO:  implement transparent bounds.  Ticket #6067
5703                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5704                fData[opValue]   = fStack->size();
5705                fData[opValue+1] = fp->fInputIdx;
5706                // Init the variable containing the start index for attempted matches.
5707                fData[opValue+2] = -1;
5708                // Save input string length, then reset to pin any matches to end at
5709                //   the current position.
5710                fData[opValue+3] = fActiveLimit;
5711                fActiveLimit     = fp->fInputIdx;
5712            }
5713            break;
5714
5715
5716        case URX_LB_CONT:
5717            {
5718                // Positive Look-Behind, at top of loop checking for matches of LB expression
5719                //    at all possible input starting positions.
5720
5721                // Fetch the min and max possible match lengths.  They are the operands
5722                //   of this op in the pattern.
5723                int32_t minML = (int32_t)pat[fp->fPatIdx++];
5724                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5725                U_ASSERT(minML <= maxML);
5726                U_ASSERT(minML >= 0);
5727
5728                // Fetch (from data) the last input index where a match was attempted.
5729                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5730                int64_t  *lbStartIdx = &fData[opValue+2];
5731                if (*lbStartIdx < 0) {
5732                    // First time through loop.
5733                    *lbStartIdx = fp->fInputIdx - minML;
5734                } else {
5735                    // 2nd through nth time through the loop.
5736                    // Back up start position for match by one.
5737                    if (*lbStartIdx == 0) {
5738                        (*lbStartIdx)--;
5739                    } else {
5740                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
5741                    }
5742                }
5743
5744                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5745                    // We have tried all potential match starting points without
5746                    //  getting a match.  Backtrack out, and out of the
5747                    //   Look Behind altogether.
5748                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5749                    int64_t restoreInputLen = fData[opValue+3];
5750                    U_ASSERT(restoreInputLen >= fActiveLimit);
5751                    U_ASSERT(restoreInputLen <= fInputLength);
5752                    fActiveLimit = restoreInputLen;
5753                    break;
5754                }
5755
5756                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5757                //      (successful match will fall off the end of the loop.)
5758                fp = StateSave(fp, fp->fPatIdx-3, status);
5759                fp->fInputIdx =  *lbStartIdx;
5760            }
5761            break;
5762
5763        case URX_LB_END:
5764            // End of a look-behind block, after a successful match.
5765            {
5766                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5767                if (fp->fInputIdx != fActiveLimit) {
5768                    //  The look-behind expression matched, but the match did not
5769                    //    extend all the way to the point that we are looking behind from.
5770                    //  FAIL out of here, which will take us back to the LB_CONT, which
5771                    //     will retry the match starting at another position or fail
5772                    //     the look-behind altogether, whichever is appropriate.
5773                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5774                    break;
5775                }
5776
5777                // Look-behind match is good.  Restore the orignal input string length,
5778                //   which had been truncated to pin the end of the lookbehind match to the
5779                //   position being looked-behind.
5780                int64_t originalInputLen = fData[opValue+3];
5781                U_ASSERT(originalInputLen >= fActiveLimit);
5782                U_ASSERT(originalInputLen <= fInputLength);
5783                fActiveLimit = originalInputLen;
5784            }
5785            break;
5786
5787
5788        case URX_LBN_CONT:
5789            {
5790                // Negative Look-Behind, at top of loop checking for matches of LB expression
5791                //    at all possible input starting positions.
5792
5793                // Fetch the extra parameters of this op.
5794                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5795                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5796                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5797                continueLoc = URX_VAL(continueLoc);
5798                U_ASSERT(minML <= maxML);
5799                U_ASSERT(minML >= 0);
5800                U_ASSERT(continueLoc > fp->fPatIdx);
5801
5802                // Fetch (from data) the last input index where a match was attempted.
5803                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5804                int64_t  *lbStartIdx = &fData[opValue+2];
5805                if (*lbStartIdx < 0) {
5806                    // First time through loop.
5807                    *lbStartIdx = fp->fInputIdx - minML;
5808                } else {
5809                    // 2nd through nth time through the loop.
5810                    // Back up start position for match by one.
5811                    if (*lbStartIdx == 0) {
5812                        (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
5813                    } else {
5814                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
5815                    }
5816                }
5817
5818                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5819                    // We have tried all potential match starting points without
5820                    //  getting a match, which means that the negative lookbehind as
5821                    //  a whole has succeeded.  Jump forward to the continue location
5822                    int64_t restoreInputLen = fData[opValue+3];
5823                    U_ASSERT(restoreInputLen >= fActiveLimit);
5824                    U_ASSERT(restoreInputLen <= fInputLength);
5825                    fActiveLimit = restoreInputLen;
5826                    fp->fPatIdx = continueLoc;
5827                    break;
5828                }
5829
5830                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5831                //      (successful match will cause a FAIL out of the loop altogether.)
5832                fp = StateSave(fp, fp->fPatIdx-4, status);
5833                fp->fInputIdx =  *lbStartIdx;
5834            }
5835            break;
5836
5837        case URX_LBN_END:
5838            // End of a negative look-behind block, after a successful match.
5839            {
5840                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5841                if (fp->fInputIdx != fActiveLimit) {
5842                    //  The look-behind expression matched, but the match did not
5843                    //    extend all the way to the point that we are looking behind from.
5844                    //  FAIL out of here, which will take us back to the LB_CONT, which
5845                    //     will retry the match starting at another position or succeed
5846                    //     the look-behind altogether, whichever is appropriate.
5847                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5848                    break;
5849                }
5850
5851                // Look-behind expression matched, which means look-behind test as
5852                //   a whole Fails
5853
5854                //   Restore the orignal input string length, which had been truncated
5855                //   inorder to pin the end of the lookbehind match
5856                //   to the position being looked-behind.
5857                int64_t originalInputLen = fData[opValue+3];
5858                U_ASSERT(originalInputLen >= fActiveLimit);
5859                U_ASSERT(originalInputLen <= fInputLength);
5860                fActiveLimit = originalInputLen;
5861
5862                // Restore original stack position, discarding any state saved
5863                //   by the successful pattern match.
5864                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5865                int32_t newStackSize = (int32_t)fData[opValue];
5866                U_ASSERT(fStack->size() > newStackSize);
5867                fStack->setSize(newStackSize);
5868
5869                //  FAIL, which will take control back to someplace
5870                //  prior to entering the look-behind test.
5871                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5872            }
5873            break;
5874
5875
5876        case URX_LOOP_SR_I:
5877            // Loop Initialization for the optimized implementation of
5878            //     [some character set]*
5879            //   This op scans through all matching input.
5880            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5881            {
5882                U_ASSERT(opValue > 0 && opValue < sets->size());
5883                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5884                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5885
5886                // Loop through input, until either the input is exhausted or
5887                //   we reach a character that is not a member of the set.
5888                int32_t ix = (int32_t)fp->fInputIdx;
5889                for (;;) {
5890                    if (ix >= fActiveLimit) {
5891                        fHitEnd = TRUE;
5892                        break;
5893                    }
5894                    UChar32   c;
5895                    U16_NEXT(inputBuf, ix, fActiveLimit, c);
5896                    if (c<256) {
5897                        if (s8->contains(c) == FALSE) {
5898                            U16_BACK_1(inputBuf, 0, ix);
5899                            break;
5900                        }
5901                    } else {
5902                        if (s->contains(c) == FALSE) {
5903                            U16_BACK_1(inputBuf, 0, ix);
5904                            break;
5905                        }
5906                    }
5907                }
5908
5909                // If there were no matching characters, skip over the loop altogether.
5910                //   The loop doesn't run at all, a * op always succeeds.
5911                if (ix == fp->fInputIdx) {
5912                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5913                    break;
5914                }
5915
5916                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5917                //   must follow.  It's operand is the stack location
5918                //   that holds the starting input index for the match of this [set]*
5919                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5920                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5921                int32_t stackLoc = URX_VAL(loopcOp);
5922                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5923                fp->fExtra[stackLoc] = fp->fInputIdx;
5924                #ifdef REGEX_SMART_BACKTRACKING
5925                backSearchIndex = fp->fInputIdx;
5926                #endif
5927                fp->fInputIdx = ix;
5928
5929                // Save State to the URX_LOOP_C op that follows this one,
5930                //   so that match failures in the following code will return to there.
5931                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5932                fp = StateSave(fp, fp->fPatIdx, status);
5933                fp->fPatIdx++;
5934            }
5935            break;
5936
5937
5938        case URX_LOOP_DOT_I:
5939            // Loop Initialization for the optimized implementation of .*
5940            //   This op scans through all remaining input.
5941            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5942            {
5943                // Loop through input until the input is exhausted (we reach an end-of-line)
5944                // In DOTALL mode, we can just go straight to the end of the input.
5945                int32_t ix;
5946                if ((opValue & 1) == 1) {
5947                    // Dot-matches-All mode.  Jump straight to the end of the string.
5948                    ix = (int32_t)fActiveLimit;
5949                    fHitEnd = TRUE;
5950                } else {
5951                    // NOT DOT ALL mode.  Line endings do not match '.'
5952                    // Scan forward until a line ending or end of input.
5953                    ix = (int32_t)fp->fInputIdx;
5954                    for (;;) {
5955                        if (ix >= fActiveLimit) {
5956                            fHitEnd = TRUE;
5957                            break;
5958                        }
5959                        UChar32   c;
5960                        U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5961                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5962                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5963                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5964                                   ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
5965                                //  char is a line ending.  Put the input pos back to the
5966                                //    line ending char, and exit the scanning loop.
5967                                U16_BACK_1(inputBuf, 0, ix);
5968                                break;
5969                            }
5970                        }
5971                    }
5972                }
5973
5974                // If there were no matching characters, skip over the loop altogether.
5975                //   The loop doesn't run at all, a * op always succeeds.
5976                if (ix == fp->fInputIdx) {
5977                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5978                    break;
5979                }
5980
5981                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5982                //   must follow.  It's operand is the stack location
5983                //   that holds the starting input index for the match of this .*
5984                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5985                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5986                int32_t stackLoc = URX_VAL(loopcOp);
5987                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5988                fp->fExtra[stackLoc] = fp->fInputIdx;
5989                #ifdef REGEX_SMART_BACKTRACKING
5990                backSearchIndex = fp->fInputIdx;
5991                #endif
5992                fp->fInputIdx = ix;
5993
5994                // Save State to the URX_LOOP_C op that follows this one,
5995                //   so that match failures in the following code will return to there.
5996                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5997                fp = StateSave(fp, fp->fPatIdx, status);
5998                fp->fPatIdx++;
5999            }
6000            break;
6001
6002
6003        case URX_LOOP_C:
6004            {
6005                U_ASSERT(opValue>=0 && opValue<fFrameSize);
6006                backSearchIndex = (int32_t)fp->fExtra[opValue];
6007                U_ASSERT(backSearchIndex <= fp->fInputIdx);
6008                if (backSearchIndex == fp->fInputIdx) {
6009                    // We've backed up the input idx to the point that the loop started.
6010                    // The loop is done.  Leave here without saving state.
6011                    //  Subsequent failures won't come back here.
6012                    break;
6013                }
6014                // Set up for the next iteration of the loop, with input index
6015                //   backed up by one from the last time through,
6016                //   and a state save to this instruction in case the following code fails again.
6017                //   (We're going backwards because this loop emulates stack unwinding, not
6018                //    the initial scan forward.)
6019                U_ASSERT(fp->fInputIdx > 0);
6020                UChar32 prevC;
6021                U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
6022
6023                if (prevC == 0x0a &&
6024                    fp->fInputIdx > backSearchIndex &&
6025                    inputBuf[fp->fInputIdx-1] == 0x0d) {
6026                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
6027                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
6028                        // .*, stepping back over CRLF pair.
6029                        U16_BACK_1(inputBuf, 0, fp->fInputIdx);
6030                    }
6031                }
6032
6033
6034                fp = StateSave(fp, fp->fPatIdx-1, status);
6035            }
6036            break;
6037
6038
6039
6040        default:
6041            // Trouble.  The compiled pattern contains an entry with an
6042            //           unrecognized type tag.
6043            U_ASSERT(FALSE);
6044        }
6045
6046        if (U_FAILURE(status)) {
6047            isMatch = FALSE;
6048            break;
6049        }
6050    }
6051
6052breakFromLoop:
6053    fMatch = isMatch;
6054    if (isMatch) {
6055        fLastMatchEnd = fMatchEnd;
6056        fMatchStart   = startIdx;
6057        fMatchEnd     = fp->fInputIdx;
6058        if (fTraceDebug) {
6059            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
6060        }
6061    }
6062    else
6063    {
6064        if (fTraceDebug) {
6065            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
6066        }
6067    }
6068
6069    fFrame = fp;                // The active stack frame when the engine stopped.
6070    //   Contains the capture group results that we need to
6071    //    access later.
6072
6073    return;
6074}
6075
6076
6077UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
6078
6079U_NAMESPACE_END
6080
6081#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
6082