1/*
2**************************************************************************
3*   Copyright (C) 2002-2010 International Business Machines Corporation  *
4*   and others. All rights reserved.                                     *
5**************************************************************************
6*/
7//
8//  file:  rematch.cpp
9//
10//         Contains the implementation of class RegexMatcher,
11//         which is one of the main API classes for the ICU regular expression package.
12//
13
14#include "unicode/utypes.h"
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/regex.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/ustring.h"
21#include "unicode/rbbi.h"
22#include "uassert.h"
23#include "cmemory.h"
24#include "uvector.h"
25#include "uvectr32.h"
26#include "uvectr64.h"
27#include "regeximp.h"
28#include "regexst.h"
29#include "regextxt.h"
30#include "ucase.h"
31
32// #include <malloc.h>        // Needed for heapcheck testing
33
34
35// Find progress callback
36// ----------------------
37// Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
38//
39#define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
40    (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
41
42
43// Smart Backtracking
44// ------------------
45// When a failure would go back to a LOOP_C instruction,
46// strings, characters, and setrefs scan backwards for a valid start
47// character themselves, pop the stack, and save state, emulating the
48// LOOP_C's effect but assured that the next character of input is a
49// possible matching character.
50//
51// Good idea in theory; unfortunately it only helps out a few specific
52// cases and slows the engine down a little in the rest.
53
54//#define REGEX_SMART_BACKTRACKING 1
55
56U_NAMESPACE_BEGIN
57
58// Default limit for the size of the back track stack, to avoid system
59//    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
60// This value puts ICU's limits higher than most other regexp implementations,
61//    which use recursion rather than the heap, and take more storage per
62//    backtrack point.
63//
64static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66// Time limit counter constant.
67//   Time limits for expression evaluation are in terms of quanta of work by
68//   the engine, each of which is 10,000 state saves.
69//   This constant determines that state saves per tick number.
70static const int32_t TIMER_INITIAL_VALUE = 10000;
71
72//-----------------------------------------------------------------------------
73//
74//   Constructor and Destructor
75//
76//-----------------------------------------------------------------------------
77RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
78    fDeferredStatus = U_ZERO_ERROR;
79    init(fDeferredStatus);
80    if (U_FAILURE(fDeferredStatus)) {
81        return;
82    }
83    if (pat==NULL) {
84        fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85        return;
86    }
87    fPattern = pat;
88    init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89}
90
91
92
93RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94                           uint32_t flags, UErrorCode &status) {
95    init(status);
96    if (U_FAILURE(status)) {
97        return;
98    }
99    UParseError    pe;
100    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
101    fPattern           = fPatternOwned;
102
103    UText inputText = UTEXT_INITIALIZER;
104    utext_openConstUnicodeString(&inputText, &input, &status);
105    init2(&inputText, status);
106    utext_close(&inputText);
107
108    fInputUniStrMaybeMutable = TRUE;
109}
110
111
112RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113                           uint32_t flags, UErrorCode &status) {
114    init(status);
115    if (U_FAILURE(status)) {
116        return;
117    }
118    UParseError    pe;
119    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
120    if (U_FAILURE(status)) {
121        return;
122    }
123
124    fPattern           = fPatternOwned;
125    init2(input, status);
126}
127
128
129RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130                           uint32_t flags, UErrorCode &status) {
131    init(status);
132    if (U_FAILURE(status)) {
133        return;
134    }
135    UParseError    pe;
136    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
137    if (U_FAILURE(status)) {
138        return;
139    }
140    fPattern           = fPatternOwned;
141    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142}
143
144RegexMatcher::RegexMatcher(UText *regexp,
145                           uint32_t flags, UErrorCode &status) {
146    init(status);
147    if (U_FAILURE(status)) {
148        return;
149    }
150    UParseError    pe;
151    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
152        if (U_FAILURE(status)) {
153        return;
154    }
155
156    fPattern           = fPatternOwned;
157    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158}
159
160
161
162
163RegexMatcher::~RegexMatcher() {
164    delete fStack;
165    if (fData != fSmallData) {
166        uprv_free(fData);
167        fData = NULL;
168    }
169    if (fPatternOwned) {
170        delete fPatternOwned;
171        fPatternOwned = NULL;
172        fPattern = NULL;
173    }
174
175    if (fInput) {
176        delete fInput;
177    }
178    if (fInputText) {
179        utext_close(fInputText);
180    }
181    if (fAltInputText) {
182        utext_close(fAltInputText);
183    }
184
185    #if UCONFIG_NO_BREAK_ITERATION==0
186    delete fWordBreakItr;
187    #endif
188}
189
190//
191//   init()   common initialization for use by all constructors.
192//            Initialize all fields, get the object into a consistent state.
193//            This must be done even when the initial status shows an error,
194//            so that the object is initialized sufficiently well for the destructor
195//            to run safely.
196//
197void RegexMatcher::init(UErrorCode &status) {
198    fPattern           = NULL;
199    fPatternOwned      = NULL;
200    fFrameSize         = 0;
201    fRegionStart       = 0;
202    fRegionLimit       = 0;
203    fAnchorStart       = 0;
204    fAnchorLimit       = 0;
205    fLookStart         = 0;
206    fLookLimit         = 0;
207    fActiveStart       = 0;
208    fActiveLimit       = 0;
209    fTransparentBounds = FALSE;
210    fAnchoringBounds   = TRUE;
211    fMatch             = FALSE;
212    fMatchStart        = 0;
213    fMatchEnd          = 0;
214    fLastMatchEnd      = -1;
215    fAppendPosition    = 0;
216    fHitEnd            = FALSE;
217    fRequireEnd        = FALSE;
218    fStack             = NULL;
219    fFrame             = NULL;
220    fTimeLimit         = 0;
221    fTime              = 0;
222    fTickCounter       = 0;
223    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
224    fCallbackFn        = NULL;
225    fCallbackContext   = NULL;
226    fFindProgressCallbackFn      = NULL;
227    fFindProgressCallbackContext = NULL;
228    fTraceDebug        = FALSE;
229    fDeferredStatus    = status;
230    fData              = fSmallData;
231    fWordBreakItr      = NULL;
232
233    fStack             = new UVector64(status);
234    fInputText         = NULL;
235    fAltInputText      = NULL;
236    fInput             = NULL;
237    fInputLength       = 0;
238    fInputUniStrMaybeMutable = FALSE;
239
240    if (U_FAILURE(status)) {
241        fDeferredStatus = status;
242    }
243}
244
245//
246//  init2()   Common initialization for use by RegexMatcher constructors, part 2.
247//            This handles the common setup to be done after the Pattern is available.
248//
249void RegexMatcher::init2(UText *input, UErrorCode &status) {
250    if (U_FAILURE(status)) {
251        fDeferredStatus = status;
252        return;
253    }
254
255    if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256        fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257        if (fData == NULL) {
258            status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259            return;
260        }
261    }
262
263    reset(input);
264    setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
265    if (U_FAILURE(status)) {
266        fDeferredStatus = status;
267        return;
268    }
269}
270
271
272static const UChar BACKSLASH  = 0x5c;
273static const UChar DOLLARSIGN = 0x24;
274//--------------------------------------------------------------------------------
275//
276//    appendReplacement
277//
278//--------------------------------------------------------------------------------
279RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
280                                              const UnicodeString &replacement,
281                                              UErrorCode &status) {
282    UText replacementText = UTEXT_INITIALIZER;
283
284    utext_openConstUnicodeString(&replacementText, &replacement, &status);
285    if (U_SUCCESS(status)) {
286        UText resultText = UTEXT_INITIALIZER;
287        utext_openUnicodeString(&resultText, &dest, &status);
288
289        if (U_SUCCESS(status)) {
290            appendReplacement(&resultText, &replacementText, status);
291            utext_close(&resultText);
292        }
293        utext_close(&replacementText);
294    }
295
296    return *this;
297}
298
299//
300//    appendReplacement, UText mode
301//
302RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303                                              UText *replacement,
304                                              UErrorCode &status) {
305    if (U_FAILURE(status)) {
306        return *this;
307    }
308    if (U_FAILURE(fDeferredStatus)) {
309        status = fDeferredStatus;
310        return *this;
311    }
312    if (fMatch == FALSE) {
313        status = U_REGEX_INVALID_STATE;
314        return *this;
315    }
316
317    // Copy input string from the end of previous match to start of current match
318    int64_t  destLen = utext_nativeLength(dest);
319    if (fMatchStart > fAppendPosition) {
320        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322                                     (int32_t)(fMatchStart-fAppendPosition), &status);
323        } else {
324            int32_t len16;
325            if (UTEXT_USES_U16(fInputText)) {
326                len16 = (int32_t)(fMatchStart-fAppendPosition);
327            } else {
328                UErrorCode lengthStatus = U_ZERO_ERROR;
329                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330            }
331            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332            if (inputChars == NULL) {
333                status = U_MEMORY_ALLOCATION_ERROR;
334                return *this;
335            }
336            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338            uprv_free(inputChars);
339        }
340    }
341    fAppendPosition = fMatchEnd;
342
343
344    // scan the replacement text, looking for substitutions ($n) and \escapes.
345    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
346    //         move entire ranges not containing substitutions.
347    UTEXT_SETNATIVEINDEX(replacement, 0);
348    UChar32 c = UTEXT_NEXT32(replacement);
349    while (c != U_SENTINEL) {
350        if (c == BACKSLASH) {
351            // Backslash Escape.  Copy the following char out without further checks.
352            //                    Note:  Surrogate pairs don't need any special handling
353            //                           The second half wont be a '$' or a '\', and
354            //                           will move to the dest normally on the next
355            //                           loop iteration.
356            c = UTEXT_CURRENT32(replacement);
357            if (c == U_SENTINEL) {
358                break;
359            }
360
361            if (c==0x55/*U*/ || c==0x75/*u*/) {
362                // We have a \udddd or \Udddddddd escape sequence.
363                int32_t offset = 0;
364                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
365                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
366                if (escapedChar != (UChar32)0xFFFFFFFF) {
367                    if (U_IS_BMP(escapedChar)) {
368                        UChar c16 = (UChar)escapedChar;
369                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
370                    } else {
371                        UChar surrogate[2];
372                        surrogate[0] = U16_LEAD(escapedChar);
373                        surrogate[1] = U16_TRAIL(escapedChar);
374                        if (U_SUCCESS(status)) {
375                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
376                        }
377                    }
378                    // TODO:  Report errors for mal-formed \u escapes?
379                    //        As this is, the original sequence is output, which may be OK.
380                    if (context.lastOffset == offset) {
381                        UTEXT_PREVIOUS32(replacement);
382                    } else if (context.lastOffset != offset-1) {
383                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
384                    }
385                }
386            } else {
387                UTEXT_NEXT32(replacement);
388                // Plain backslash escape.  Just put out the escaped character.
389                if (U_IS_BMP(c)) {
390                    UChar c16 = (UChar)c;
391                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
392                } else {
393                    UChar surrogate[2];
394                    surrogate[0] = U16_LEAD(c);
395                    surrogate[1] = U16_TRAIL(c);
396                    if (U_SUCCESS(status)) {
397                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
398                    }
399                }
400            }
401        } else if (c != DOLLARSIGN) {
402            // Normal char, not a $.  Copy it out without further checks.
403            if (U_IS_BMP(c)) {
404                UChar c16 = (UChar)c;
405                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
406            } else {
407                UChar surrogate[2];
408                surrogate[0] = U16_LEAD(c);
409                surrogate[1] = U16_TRAIL(c);
410                if (U_SUCCESS(status)) {
411                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
412                }
413            }
414        } else {
415            // We've got a $.  Pick up a capture group number if one follows.
416            // Consume at most the number of digits necessary for the largest capture
417            // number that is valid for this pattern.
418
419            int32_t numDigits = 0;
420            int32_t groupNum  = 0;
421            UChar32 digitC;
422            for (;;) {
423                digitC = UTEXT_CURRENT32(replacement);
424                if (digitC == U_SENTINEL) {
425                    break;
426                }
427                if (u_isdigit(digitC) == FALSE) {
428                    break;
429                }
430                UTEXT_NEXT32(replacement);
431                groupNum=groupNum*10 + u_charDigitValue(digitC);
432                numDigits++;
433                if (numDigits >= fPattern->fMaxCaptureDigits) {
434                    break;
435                }
436            }
437
438
439            if (numDigits == 0) {
440                // The $ didn't introduce a group number at all.
441                // Treat it as just part of the substitution text.
442                UChar c16 = DOLLARSIGN;
443                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
444            } else {
445                // Finally, append the capture group data to the destination.
446                destLen += appendGroup(groupNum, dest, status);
447                if (U_FAILURE(status)) {
448                    // Can fail if group number is out of range.
449                    break;
450                }
451            }
452        }
453
454        if (U_FAILURE(status)) {
455            break;
456        } else {
457            c = UTEXT_NEXT32(replacement);
458        }
459    }
460
461    return *this;
462}
463
464
465
466//--------------------------------------------------------------------------------
467//
468//    appendTail     Intended to be used in conjunction with appendReplacement()
469//                   To the destination string, append everything following
470//                   the last match position from the input string.
471//
472//                   Note:  Match ranges do not affect appendTail or appendReplacement
473//
474//--------------------------------------------------------------------------------
475UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
476    UErrorCode status = U_ZERO_ERROR;
477    UText resultText = UTEXT_INITIALIZER;
478    utext_openUnicodeString(&resultText, &dest, &status);
479
480    if (U_SUCCESS(status)) {
481        appendTail(&resultText, status);
482        utext_close(&resultText);
483    }
484
485    return dest;
486}
487
488//
489//   appendTail, UText mode
490//
491UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
492    UBool bailOut = FALSE;
493    if (U_FAILURE(status)) {
494        bailOut = TRUE;
495    }
496    if (U_FAILURE(fDeferredStatus)) {
497        status = fDeferredStatus;
498        bailOut = TRUE;
499    }
500
501    if (bailOut) {
502        //  dest must not be NULL
503        if (dest) {
504            utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status);
505            return dest;
506        }
507    }
508
509    if (fInputLength > fAppendPosition) {
510        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
511            int64_t destLen = utext_nativeLength(dest);
512            utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
513                          (int32_t)(fInputLength-fAppendPosition), &status);
514        } else {
515            int32_t len16;
516            if (UTEXT_USES_U16(fInputText)) {
517                len16 = (int32_t)(fInputLength-fAppendPosition);
518            } else {
519                len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
520                status = U_ZERO_ERROR; // buffer overflow
521            }
522
523            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
524            if (inputChars == NULL) {
525                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
526            } else {
527                utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
528                int64_t destLen = utext_nativeLength(dest);
529                utext_replace(dest, destLen, destLen, inputChars, len16, &status);
530                uprv_free(inputChars);
531            }
532        }
533    }
534    return dest;
535}
536
537
538
539//--------------------------------------------------------------------------------
540//
541//   end
542//
543//--------------------------------------------------------------------------------
544int32_t RegexMatcher::end(UErrorCode &err) const {
545    return end(0, err);
546}
547
548int64_t RegexMatcher::end64(UErrorCode &err) const {
549    return end64(0, err);
550}
551
552int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
553    if (U_FAILURE(err)) {
554        return -1;
555    }
556    if (fMatch == FALSE) {
557        err = U_REGEX_INVALID_STATE;
558        return -1;
559    }
560    if (group < 0 || group > fPattern->fGroupMap->size()) {
561        err = U_INDEX_OUTOFBOUNDS_ERROR;
562        return -1;
563    }
564    int64_t e = -1;
565    if (group == 0) {
566        e = fMatchEnd;
567    } else {
568        // Get the position within the stack frame of the variables for
569        //    this capture group.
570        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
571        U_ASSERT(groupOffset < fPattern->fFrameSize);
572        U_ASSERT(groupOffset >= 0);
573        e = fFrame->fExtra[groupOffset + 1];
574    }
575
576        return e;
577}
578
579int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
580    return (int32_t)end64(group, err);
581}
582
583
584//--------------------------------------------------------------------------------
585//
586//   find()
587//
588//--------------------------------------------------------------------------------
589UBool RegexMatcher::find() {
590    // Start at the position of the last match end.  (Will be zero if the
591    //   matcher has been reset.)
592    //
593    if (U_FAILURE(fDeferredStatus)) {
594        return FALSE;
595    }
596
597    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
598        return findUsingChunk();
599    }
600
601    int64_t startPos = fMatchEnd;
602    if (startPos==0) {
603        startPos = fActiveStart;
604    }
605
606    if (fMatch) {
607        // Save the position of any previous successful match.
608        fLastMatchEnd = fMatchEnd;
609
610        if (fMatchStart == fMatchEnd) {
611            // Previous match had zero length.  Move start position up one position
612            //  to avoid sending find() into a loop on zero-length matches.
613            if (startPos >= fActiveLimit) {
614                fMatch = FALSE;
615                fHitEnd = TRUE;
616                return FALSE;
617            }
618            UTEXT_SETNATIVEINDEX(fInputText, startPos);
619            UTEXT_NEXT32(fInputText);
620            startPos = UTEXT_GETNATIVEINDEX(fInputText);
621        }
622    } else {
623        if (fLastMatchEnd >= 0) {
624            // A previous find() failed to match.  Don't try again.
625            //   (without this test, a pattern with a zero-length match
626            //    could match again at the end of an input string.)
627            fHitEnd = TRUE;
628            return FALSE;
629        }
630    }
631
632
633    // Compute the position in the input string beyond which a match can not begin, because
634    //   the minimum length match would extend past the end of the input.
635    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
636    //          Be aware of possible overflows if making changes here.
637    int64_t testStartLimit;
638    if (UTEXT_USES_U16(fInputText)) {
639        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
640        if (startPos > testStartLimit) {
641            fMatch = FALSE;
642            fHitEnd = TRUE;
643            return FALSE;
644        }
645    } else {
646        // For now, let the matcher discover that it can't match on its own
647        // We don't know how long the match len is in native characters
648        testStartLimit = fActiveLimit;
649    }
650
651    UChar32  c;
652    U_ASSERT(startPos >= 0);
653
654    switch (fPattern->fStartType) {
655    case START_NO_INFO:
656        // No optimization was found.
657        //  Try a match at each input position.
658        for (;;) {
659            MatchAt(startPos, FALSE, fDeferredStatus);
660            if (U_FAILURE(fDeferredStatus)) {
661                return FALSE;
662            }
663            if (fMatch) {
664                return TRUE;
665            }
666            if (startPos >= testStartLimit) {
667                fHitEnd = TRUE;
668                return FALSE;
669            }
670            UTEXT_SETNATIVEINDEX(fInputText, startPos);
671            UTEXT_NEXT32(fInputText);
672            startPos = UTEXT_GETNATIVEINDEX(fInputText);
673            // Note that it's perfectly OK for a pattern to have a zero-length
674            //   match at the end of a string, so we must make sure that the loop
675            //   runs with startPos == testStartLimit the last time through.
676            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
677                return FALSE;
678        }
679        U_ASSERT(FALSE);
680
681    case START_START:
682        // Matches are only possible at the start of the input string
683        //   (pattern begins with ^ or \A)
684        if (startPos > fActiveStart) {
685            fMatch = FALSE;
686            return FALSE;
687        }
688        MatchAt(startPos, FALSE, fDeferredStatus);
689        if (U_FAILURE(fDeferredStatus)) {
690            return FALSE;
691        }
692        return fMatch;
693
694
695    case START_SET:
696        {
697            // Match may start on any char from a pre-computed set.
698            U_ASSERT(fPattern->fMinMatchLen > 0);
699            int64_t pos;
700            UTEXT_SETNATIVEINDEX(fInputText, startPos);
701            for (;;) {
702                c = UTEXT_NEXT32(fInputText);
703                pos = UTEXT_GETNATIVEINDEX(fInputText);
704                // c will be -1 (U_SENTINEL) at end of text, in which case we
705                // skip this next block (so we don't have a negative array index)
706                // and handle end of text in the following block.
707                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
708                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
709                    MatchAt(startPos, FALSE, fDeferredStatus);
710                    if (U_FAILURE(fDeferredStatus)) {
711                        return FALSE;
712                    }
713                    if (fMatch) {
714                        return TRUE;
715                    }
716                    UTEXT_SETNATIVEINDEX(fInputText, pos);
717                }
718                if (startPos >= testStartLimit) {
719                    fMatch = FALSE;
720                    fHitEnd = TRUE;
721                    return FALSE;
722                }
723                startPos = pos;
724	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
725                    return FALSE;
726            }
727        }
728        U_ASSERT(FALSE);
729
730    case START_STRING:
731    case START_CHAR:
732        {
733            // Match starts on exactly one char.
734            U_ASSERT(fPattern->fMinMatchLen > 0);
735            UChar32 theChar = fPattern->fInitialChar;
736            int64_t pos;
737            UTEXT_SETNATIVEINDEX(fInputText, startPos);
738            for (;;) {
739                c = UTEXT_NEXT32(fInputText);
740                pos = UTEXT_GETNATIVEINDEX(fInputText);
741                if (c == theChar) {
742                    MatchAt(startPos, FALSE, fDeferredStatus);
743                    if (U_FAILURE(fDeferredStatus)) {
744                        return FALSE;
745                    }
746                    if (fMatch) {
747                        return TRUE;
748                    }
749                    UTEXT_SETNATIVEINDEX(fInputText, pos);
750                }
751                if (startPos >= testStartLimit) {
752                    fMatch = FALSE;
753                    fHitEnd = TRUE;
754                    return FALSE;
755                }
756                startPos = pos;
757	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
758                    return FALSE;
759           }
760        }
761        U_ASSERT(FALSE);
762
763    case START_LINE:
764        {
765            UChar32  c;
766            if (startPos == fAnchorStart) {
767                MatchAt(startPos, FALSE, fDeferredStatus);
768                if (U_FAILURE(fDeferredStatus)) {
769                    return FALSE;
770                }
771                if (fMatch) {
772                    return TRUE;
773                }
774                UTEXT_SETNATIVEINDEX(fInputText, startPos);
775                c = UTEXT_NEXT32(fInputText);
776                startPos = UTEXT_GETNATIVEINDEX(fInputText);
777            } else {
778                UTEXT_SETNATIVEINDEX(fInputText, startPos);
779                c = UTEXT_PREVIOUS32(fInputText);
780                UTEXT_SETNATIVEINDEX(fInputText, startPos);
781            }
782
783            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
784                for (;;) {
785                    if (c == 0x0a) {
786                            MatchAt(startPos, FALSE, fDeferredStatus);
787                            if (U_FAILURE(fDeferredStatus)) {
788                                return FALSE;
789                            }
790                            if (fMatch) {
791                                return TRUE;
792                            }
793                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
794                    }
795                    if (startPos >= testStartLimit) {
796                        fMatch = FALSE;
797                        fHitEnd = TRUE;
798                        return FALSE;
799                    }
800                    c = UTEXT_NEXT32(fInputText);
801                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
802                    // Note that it's perfectly OK for a pattern to have a zero-length
803                    //   match at the end of a string, so we must make sure that the loop
804                    //   runs with startPos == testStartLimit the last time through.
805		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
806                        return FALSE;
807                }
808            } else {
809                for (;;) {
810                    if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
811                        ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
812                            if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
813                                UTEXT_NEXT32(fInputText);
814                                startPos = UTEXT_GETNATIVEINDEX(fInputText);
815                            }
816                            MatchAt(startPos, FALSE, fDeferredStatus);
817                            if (U_FAILURE(fDeferredStatus)) {
818                                return FALSE;
819                            }
820                            if (fMatch) {
821                                return TRUE;
822                            }
823                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
824                    }
825                    if (startPos >= testStartLimit) {
826                        fMatch = FALSE;
827                        fHitEnd = TRUE;
828                        return FALSE;
829                    }
830                    c = UTEXT_NEXT32(fInputText);
831                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
832                    // Note that it's perfectly OK for a pattern to have a zero-length
833                    //   match at the end of a string, so we must make sure that the loop
834                    //   runs with startPos == testStartLimit the last time through.
835		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
836                        return FALSE;
837                }
838            }
839        }
840
841    default:
842        U_ASSERT(FALSE);
843    }
844
845    U_ASSERT(FALSE);
846    return FALSE;
847}
848
849
850
851UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
852    if (U_FAILURE(status)) {
853        return FALSE;
854    }
855    if (U_FAILURE(fDeferredStatus)) {
856        status = fDeferredStatus;
857        return FALSE;
858    }
859    this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
860                                          //        This will reset the region to be the full input length.
861    if (start < 0) {
862        status = U_INDEX_OUTOFBOUNDS_ERROR;
863        return FALSE;
864    }
865
866    int64_t nativeStart = start;
867    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
868        status = U_INDEX_OUTOFBOUNDS_ERROR;
869        return FALSE;
870    }
871    fMatchEnd = nativeStart;
872    return find();
873}
874
875
876//--------------------------------------------------------------------------------
877//
878//   findUsingChunk() -- like find(), but with the advance knowledge that the
879//                       entire string is available in the UText's chunk buffer.
880//
881//--------------------------------------------------------------------------------
882UBool RegexMatcher::findUsingChunk() {
883    // Start at the position of the last match end.  (Will be zero if the
884    //   matcher has been reset.
885    //
886
887    int32_t startPos = (int32_t)fMatchEnd;
888    if (startPos==0) {
889        startPos = (int32_t)fActiveStart;
890    }
891
892    const UChar *inputBuf = fInputText->chunkContents;
893
894    if (fMatch) {
895        // Save the position of any previous successful match.
896        fLastMatchEnd = fMatchEnd;
897
898        if (fMatchStart == fMatchEnd) {
899            // Previous match had zero length.  Move start position up one position
900            //  to avoid sending find() into a loop on zero-length matches.
901            if (startPos >= fActiveLimit) {
902                fMatch = FALSE;
903                fHitEnd = TRUE;
904                return FALSE;
905            }
906            U16_FWD_1(inputBuf, startPos, fInputLength);
907        }
908    } else {
909        if (fLastMatchEnd >= 0) {
910            // A previous find() failed to match.  Don't try again.
911            //   (without this test, a pattern with a zero-length match
912            //    could match again at the end of an input string.)
913            fHitEnd = TRUE;
914            return FALSE;
915        }
916    }
917
918
919    // Compute the position in the input string beyond which a match can not begin, because
920    //   the minimum length match would extend past the end of the input.
921    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
922    //          Be aware of possible overflows if making changes here.
923    int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
924    if (startPos > testLen) {
925        fMatch = FALSE;
926        fHitEnd = TRUE;
927        return FALSE;
928    }
929
930    UChar32  c;
931    U_ASSERT(startPos >= 0);
932
933    switch (fPattern->fStartType) {
934    case START_NO_INFO:
935        // No optimization was found.
936        //  Try a match at each input position.
937        for (;;) {
938            MatchChunkAt(startPos, FALSE, fDeferredStatus);
939            if (U_FAILURE(fDeferredStatus)) {
940                return FALSE;
941            }
942            if (fMatch) {
943                return TRUE;
944            }
945            if (startPos >= testLen) {
946                fHitEnd = TRUE;
947                return FALSE;
948            }
949            U16_FWD_1(inputBuf, startPos, fActiveLimit);
950            // Note that it's perfectly OK for a pattern to have a zero-length
951            //   match at the end of a string, so we must make sure that the loop
952            //   runs with startPos == testLen the last time through.
953            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
954                return FALSE;
955        }
956        U_ASSERT(FALSE);
957
958    case START_START:
959        // Matches are only possible at the start of the input string
960        //   (pattern begins with ^ or \A)
961        if (startPos > fActiveStart) {
962            fMatch = FALSE;
963            return FALSE;
964        }
965        MatchChunkAt(startPos, FALSE, fDeferredStatus);
966        if (U_FAILURE(fDeferredStatus)) {
967            return FALSE;
968        }
969        return fMatch;
970
971
972    case START_SET:
973    {
974        // Match may start on any char from a pre-computed set.
975        U_ASSERT(fPattern->fMinMatchLen > 0);
976        for (;;) {
977            int32_t pos = startPos;
978            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
979            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
980                (c>=256 && fPattern->fInitialChars->contains(c))) {
981                MatchChunkAt(pos, FALSE, fDeferredStatus);
982                if (U_FAILURE(fDeferredStatus)) {
983                    return FALSE;
984                }
985                if (fMatch) {
986                    return TRUE;
987                }
988            }
989            if (pos >= testLen) {
990                fMatch = FALSE;
991                fHitEnd = TRUE;
992                return FALSE;
993            }
994            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
995                return FALSE;
996        }
997    }
998        U_ASSERT(FALSE);
999
1000    case START_STRING:
1001    case START_CHAR:
1002    {
1003        // Match starts on exactly one char.
1004        U_ASSERT(fPattern->fMinMatchLen > 0);
1005        UChar32 theChar = fPattern->fInitialChar;
1006        for (;;) {
1007            int32_t pos = startPos;
1008            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1009            if (c == theChar) {
1010                MatchChunkAt(pos, FALSE, fDeferredStatus);
1011                if (U_FAILURE(fDeferredStatus)) {
1012                    return FALSE;
1013                }
1014                if (fMatch) {
1015                    return TRUE;
1016                }
1017            }
1018            if (pos >= testLen) {
1019                fMatch = FALSE;
1020                fHitEnd = TRUE;
1021                return FALSE;
1022            }
1023            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1024                return FALSE;
1025        }
1026    }
1027        U_ASSERT(FALSE);
1028
1029    case START_LINE:
1030    {
1031        UChar32  c;
1032        if (startPos == fAnchorStart) {
1033            MatchChunkAt(startPos, FALSE, fDeferredStatus);
1034            if (U_FAILURE(fDeferredStatus)) {
1035                return FALSE;
1036            }
1037            if (fMatch) {
1038                return TRUE;
1039            }
1040            U16_FWD_1(inputBuf, startPos, fActiveLimit);
1041        }
1042
1043        if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1044            for (;;) {
1045                c = inputBuf[startPos-1];
1046                if (c == 0x0a) {
1047                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
1048                    if (U_FAILURE(fDeferredStatus)) {
1049                        return FALSE;
1050                    }
1051                    if (fMatch) {
1052                        return TRUE;
1053                    }
1054                }
1055                if (startPos >= testLen) {
1056                    fMatch = FALSE;
1057                    fHitEnd = TRUE;
1058                    return FALSE;
1059                }
1060                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1061                // Note that it's perfectly OK for a pattern to have a zero-length
1062                //   match at the end of a string, so we must make sure that the loop
1063                //   runs with startPos == testLen the last time through.
1064	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1065                    return FALSE;
1066            }
1067        } else {
1068            for (;;) {
1069                c = inputBuf[startPos-1];
1070                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
1071                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1072                    if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1073                        startPos++;
1074                    }
1075                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
1076                    if (U_FAILURE(fDeferredStatus)) {
1077                        return FALSE;
1078                    }
1079                    if (fMatch) {
1080                        return TRUE;
1081                    }
1082                }
1083                if (startPos >= testLen) {
1084                    fMatch = FALSE;
1085                    fHitEnd = TRUE;
1086                    return FALSE;
1087                }
1088                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1089                // Note that it's perfectly OK for a pattern to have a zero-length
1090                //   match at the end of a string, so we must make sure that the loop
1091                //   runs with startPos == testLen the last time through.
1092	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1093                    return FALSE;
1094            }
1095        }
1096    }
1097
1098    default:
1099        U_ASSERT(FALSE);
1100    }
1101
1102    U_ASSERT(FALSE);
1103    return FALSE;
1104}
1105
1106
1107
1108//--------------------------------------------------------------------------------
1109//
1110//  group()
1111//
1112//--------------------------------------------------------------------------------
1113UnicodeString RegexMatcher::group(UErrorCode &status) const {
1114    return group(0, status);
1115}
1116
1117//  Return immutable shallow clone
1118UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1119    return group(0, dest, group_len, status);
1120}
1121
1122//  Return immutable shallow clone
1123UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1124    group_len = 0;
1125    UBool bailOut = FALSE;
1126    if (U_FAILURE(status)) {
1127        return dest;
1128    }
1129    if (U_FAILURE(fDeferredStatus)) {
1130        status = fDeferredStatus;
1131        bailOut = TRUE;
1132    }
1133    if (fMatch == FALSE) {
1134        status = U_REGEX_INVALID_STATE;
1135        bailOut = TRUE;
1136    }
1137    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1138        status = U_INDEX_OUTOFBOUNDS_ERROR;
1139        bailOut = TRUE;
1140    }
1141
1142    if (bailOut) {
1143        return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status);
1144    }
1145
1146    int64_t s, e;
1147    if (groupNum == 0) {
1148        s = fMatchStart;
1149        e = fMatchEnd;
1150    } else {
1151        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1152        U_ASSERT(groupOffset < fPattern->fFrameSize);
1153        U_ASSERT(groupOffset >= 0);
1154        s = fFrame->fExtra[groupOffset];
1155        e = fFrame->fExtra[groupOffset+1];
1156    }
1157
1158    if (s < 0) {
1159        // A capture group wasn't part of the match
1160        return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1161    }
1162    U_ASSERT(s <= e);
1163    group_len = e - s;
1164
1165    dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1166    if (dest)
1167        UTEXT_SETNATIVEINDEX(dest, s);
1168    return dest;
1169}
1170
1171UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1172    UnicodeString result;
1173    if (U_FAILURE(status)) {
1174        return result;
1175    }
1176    UText resultText = UTEXT_INITIALIZER;
1177    utext_openUnicodeString(&resultText, &result, &status);
1178    group(groupNum, &resultText, status);
1179    utext_close(&resultText);
1180    return result;
1181}
1182
1183
1184//  Return deep (mutable) clone
1185//		Technology Preview (as an API), but note that the UnicodeString API is implemented
1186//		using this function.
1187UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1188    UBool bailOut = FALSE;
1189    if (U_FAILURE(status)) {
1190        return dest;
1191    }
1192    if (U_FAILURE(fDeferredStatus)) {
1193        status = fDeferredStatus;
1194        bailOut = TRUE;
1195    }
1196
1197    if (fMatch == FALSE) {
1198        status = U_REGEX_INVALID_STATE;
1199        bailOut = TRUE;
1200    }
1201    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1202        status = U_INDEX_OUTOFBOUNDS_ERROR;
1203        bailOut = TRUE;
1204    }
1205
1206    if (bailOut) {
1207        if (dest) {
1208            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1209            return dest;
1210        } else {
1211            return utext_openUChars(NULL, NULL, 0, &status);
1212        }
1213    }
1214
1215    int64_t s, e;
1216    if (groupNum == 0) {
1217        s = fMatchStart;
1218        e = fMatchEnd;
1219    } else {
1220        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1221        U_ASSERT(groupOffset < fPattern->fFrameSize);
1222        U_ASSERT(groupOffset >= 0);
1223        s = fFrame->fExtra[groupOffset];
1224        e = fFrame->fExtra[groupOffset+1];
1225    }
1226
1227    if (s < 0) {
1228        // A capture group wasn't part of the match
1229        if (dest) {
1230            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1231            return dest;
1232        } else {
1233            return utext_openUChars(NULL, NULL, 0, &status);
1234        }
1235    }
1236    U_ASSERT(s <= e);
1237
1238    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1239        U_ASSERT(e <= fInputLength);
1240        if (dest) {
1241            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1242        } else {
1243            UText groupText = UTEXT_INITIALIZER;
1244            utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1245            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1246            utext_close(&groupText);
1247        }
1248    } else {
1249        int32_t len16;
1250        if (UTEXT_USES_U16(fInputText)) {
1251            len16 = (int32_t)(e-s);
1252        } else {
1253            UErrorCode lengthStatus = U_ZERO_ERROR;
1254            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1255        }
1256        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1257        if (groupChars == NULL) {
1258            status = U_MEMORY_ALLOCATION_ERROR;
1259            return dest;
1260        }
1261        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1262
1263        if (dest) {
1264            utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1265        } else {
1266            UText groupText = UTEXT_INITIALIZER;
1267            utext_openUChars(&groupText, groupChars, len16, &status);
1268            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1269            utext_close(&groupText);
1270        }
1271
1272        uprv_free(groupChars);
1273    }
1274    return dest;
1275}
1276
1277//--------------------------------------------------------------------------------
1278//
1279//  appendGroup() -- currently internal only, appends a group to a UText rather
1280//                   than replacing its contents
1281//
1282//--------------------------------------------------------------------------------
1283
1284int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1285    if (U_FAILURE(status)) {
1286        return 0;
1287    }
1288    if (U_FAILURE(fDeferredStatus)) {
1289        status = fDeferredStatus;
1290        return 0;
1291    }
1292    int64_t destLen = utext_nativeLength(dest);
1293
1294    if (fMatch == FALSE) {
1295        status = U_REGEX_INVALID_STATE;
1296        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297    }
1298    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1299        status = U_INDEX_OUTOFBOUNDS_ERROR;
1300        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1301    }
1302
1303    int64_t s, e;
1304    if (groupNum == 0) {
1305        s = fMatchStart;
1306        e = fMatchEnd;
1307    } else {
1308        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1309        U_ASSERT(groupOffset < fPattern->fFrameSize);
1310        U_ASSERT(groupOffset >= 0);
1311        s = fFrame->fExtra[groupOffset];
1312        e = fFrame->fExtra[groupOffset+1];
1313    }
1314
1315    if (s < 0) {
1316        // A capture group wasn't part of the match
1317        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1318    }
1319    U_ASSERT(s <= e);
1320
1321    int64_t deltaLen;
1322    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1323        U_ASSERT(e <= fInputLength);
1324        deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1325    } else {
1326        int32_t len16;
1327        if (UTEXT_USES_U16(fInputText)) {
1328            len16 = (int32_t)(e-s);
1329        } else {
1330            UErrorCode lengthStatus = U_ZERO_ERROR;
1331            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1332        }
1333        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1334        if (groupChars == NULL) {
1335            status = U_MEMORY_ALLOCATION_ERROR;
1336            return 0;
1337        }
1338        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1339
1340        deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1341        uprv_free(groupChars);
1342    }
1343    return deltaLen;
1344}
1345
1346
1347
1348//--------------------------------------------------------------------------------
1349//
1350//  groupCount()
1351//
1352//--------------------------------------------------------------------------------
1353int32_t RegexMatcher::groupCount() const {
1354    return fPattern->fGroupMap->size();
1355}
1356
1357
1358
1359//--------------------------------------------------------------------------------
1360//
1361//  hasAnchoringBounds()
1362//
1363//--------------------------------------------------------------------------------
1364UBool RegexMatcher::hasAnchoringBounds() const {
1365    return fAnchoringBounds;
1366}
1367
1368
1369//--------------------------------------------------------------------------------
1370//
1371//  hasTransparentBounds()
1372//
1373//--------------------------------------------------------------------------------
1374UBool RegexMatcher::hasTransparentBounds() const {
1375    return fTransparentBounds;
1376}
1377
1378
1379
1380//--------------------------------------------------------------------------------
1381//
1382//  hitEnd()
1383//
1384//--------------------------------------------------------------------------------
1385UBool RegexMatcher::hitEnd() const {
1386    return fHitEnd;
1387}
1388
1389
1390//--------------------------------------------------------------------------------
1391//
1392//  input()
1393//
1394//--------------------------------------------------------------------------------
1395const UnicodeString &RegexMatcher::input() const {
1396    if (!fInput) {
1397        UErrorCode status = U_ZERO_ERROR;
1398        int32_t len16;
1399        if (UTEXT_USES_U16(fInputText)) {
1400            len16 = (int32_t)fInputLength;
1401        } else {
1402            len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1403            status = U_ZERO_ERROR; // overflow, length status
1404        }
1405        UnicodeString *result = new UnicodeString(len16, 0, 0);
1406
1407        UChar *inputChars = result->getBuffer(len16);
1408        utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1409        result->releaseBuffer(len16);
1410
1411        (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1412    }
1413
1414    return *fInput;
1415}
1416
1417//--------------------------------------------------------------------------------
1418//
1419//  inputText()
1420//
1421//--------------------------------------------------------------------------------
1422UText *RegexMatcher::inputText() const {
1423    return fInputText;
1424}
1425
1426
1427//--------------------------------------------------------------------------------
1428//
1429//  getInput() -- like inputText(), but makes a clone or copies into another UText
1430//
1431//--------------------------------------------------------------------------------
1432UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1433    UBool bailOut = FALSE;
1434    if (U_FAILURE(status)) {
1435        return dest;
1436    }
1437    if (U_FAILURE(fDeferredStatus)) {
1438        status = fDeferredStatus;
1439        bailOut = TRUE;
1440    }
1441
1442    if (bailOut) {
1443        if (dest) {
1444            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1445            return dest;
1446        } else {
1447            return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1448        }
1449    }
1450
1451    if (dest) {
1452        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1453            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1454        } else {
1455            int32_t input16Len;
1456            if (UTEXT_USES_U16(fInputText)) {
1457                input16Len = (int32_t)fInputLength;
1458            } else {
1459                UErrorCode lengthStatus = U_ZERO_ERROR;
1460                input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1461            }
1462            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1463            if (inputChars == NULL) {
1464                return dest;
1465            }
1466
1467            status = U_ZERO_ERROR;
1468            utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1469            status = U_ZERO_ERROR;
1470            utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1471
1472            uprv_free(inputChars);
1473        }
1474        return dest;
1475    } else {
1476        return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1477    }
1478}
1479
1480
1481static UBool compat_SyncMutableUTextContents(UText *ut);
1482static UBool compat_SyncMutableUTextContents(UText *ut) {
1483    UBool retVal = FALSE;
1484
1485    //  In the following test, we're really only interested in whether the UText should switch
1486    //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1487    //  will still point to the correct data.
1488    if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1489        UnicodeString *us=(UnicodeString *)ut->context;
1490
1491        // Update to the latest length.
1492        // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1493        int32_t newLength = us->length();
1494
1495        // Update the chunk description.
1496        // The buffer may have switched between stack- and heap-based.
1497        ut->chunkContents    = us->getBuffer();
1498        ut->chunkLength      = newLength;
1499        ut->chunkNativeLimit = newLength;
1500        ut->nativeIndexingLimit = newLength;
1501        retVal = TRUE;
1502    }
1503
1504    return retVal;
1505}
1506
1507//--------------------------------------------------------------------------------
1508//
1509//  lookingAt()
1510//
1511//--------------------------------------------------------------------------------
1512UBool RegexMatcher::lookingAt(UErrorCode &status) {
1513    if (U_FAILURE(status)) {
1514        return FALSE;
1515    }
1516    if (U_FAILURE(fDeferredStatus)) {
1517        status = fDeferredStatus;
1518        return FALSE;
1519    }
1520
1521    if (fInputUniStrMaybeMutable) {
1522        if (compat_SyncMutableUTextContents(fInputText)) {
1523        fInputLength = utext_nativeLength(fInputText);
1524        reset();
1525        }
1526    }
1527    else {
1528        resetPreserveRegion();
1529    }
1530    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1531        MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1532    } else {
1533        MatchAt(fActiveStart, FALSE, status);
1534    }
1535    return fMatch;
1536}
1537
1538
1539UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1540    if (U_FAILURE(status)) {
1541        return FALSE;
1542    }
1543    if (U_FAILURE(fDeferredStatus)) {
1544        status = fDeferredStatus;
1545        return FALSE;
1546    }
1547    reset();
1548
1549    if (start < 0) {
1550        status = U_INDEX_OUTOFBOUNDS_ERROR;
1551        return FALSE;
1552    }
1553
1554    if (fInputUniStrMaybeMutable) {
1555        if (compat_SyncMutableUTextContents(fInputText)) {
1556        fInputLength = utext_nativeLength(fInputText);
1557        reset();
1558        }
1559    }
1560
1561    int64_t nativeStart;
1562    nativeStart = start;
1563    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1564        status = U_INDEX_OUTOFBOUNDS_ERROR;
1565        return FALSE;
1566    }
1567
1568    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1569        MatchChunkAt((int32_t)nativeStart, FALSE, status);
1570    } else {
1571        MatchAt(nativeStart, FALSE, status);
1572    }
1573    return fMatch;
1574}
1575
1576
1577
1578//--------------------------------------------------------------------------------
1579//
1580//  matches()
1581//
1582//--------------------------------------------------------------------------------
1583UBool RegexMatcher::matches(UErrorCode &status) {
1584    if (U_FAILURE(status)) {
1585        return FALSE;
1586    }
1587    if (U_FAILURE(fDeferredStatus)) {
1588        status = fDeferredStatus;
1589        return FALSE;
1590    }
1591
1592    if (fInputUniStrMaybeMutable) {
1593        if (compat_SyncMutableUTextContents(fInputText)) {
1594        fInputLength = utext_nativeLength(fInputText);
1595        reset();
1596        }
1597    }
1598    else {
1599        resetPreserveRegion();
1600    }
1601
1602    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1603        MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1604    } else {
1605        MatchAt(fActiveStart, TRUE, status);
1606    }
1607    return fMatch;
1608}
1609
1610
1611UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1612    if (U_FAILURE(status)) {
1613        return FALSE;
1614    }
1615    if (U_FAILURE(fDeferredStatus)) {
1616        status = fDeferredStatus;
1617        return FALSE;
1618    }
1619    reset();
1620
1621    if (start < 0) {
1622        status = U_INDEX_OUTOFBOUNDS_ERROR;
1623        return FALSE;
1624    }
1625
1626    if (fInputUniStrMaybeMutable) {
1627        if (compat_SyncMutableUTextContents(fInputText)) {
1628        fInputLength = utext_nativeLength(fInputText);
1629        reset();
1630        }
1631    }
1632
1633    int64_t nativeStart;
1634    nativeStart = start;
1635    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1636        status = U_INDEX_OUTOFBOUNDS_ERROR;
1637        return FALSE;
1638    }
1639
1640    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1641        MatchChunkAt((int32_t)nativeStart, TRUE, status);
1642    } else {
1643        MatchAt(nativeStart, TRUE, status);
1644    }
1645    return fMatch;
1646}
1647
1648
1649
1650//--------------------------------------------------------------------------------
1651//
1652//    pattern
1653//
1654//--------------------------------------------------------------------------------
1655const RegexPattern &RegexMatcher::pattern() const {
1656    return *fPattern;
1657}
1658
1659
1660
1661//--------------------------------------------------------------------------------
1662//
1663//    region
1664//
1665//--------------------------------------------------------------------------------
1666RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1667    if (U_FAILURE(status)) {
1668        return *this;
1669    }
1670
1671    if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1672        status = U_ILLEGAL_ARGUMENT_ERROR;
1673    }
1674
1675    int64_t nativeStart = regionStart;
1676    int64_t nativeLimit = regionLimit;
1677    if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1678      status = U_ILLEGAL_ARGUMENT_ERROR;
1679    }
1680
1681    if (startIndex == -1)
1682      this->reset();
1683    else
1684      resetPreserveRegion();
1685
1686    fRegionStart = nativeStart;
1687    fRegionLimit = nativeLimit;
1688    fActiveStart = nativeStart;
1689    fActiveLimit = nativeLimit;
1690
1691    if (startIndex != -1) {
1692      if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1693          status = U_INDEX_OUTOFBOUNDS_ERROR;
1694      }
1695      fMatchEnd = startIndex;
1696    }
1697
1698    if (!fTransparentBounds) {
1699        fLookStart = nativeStart;
1700        fLookLimit = nativeLimit;
1701    }
1702    if (fAnchoringBounds) {
1703        fAnchorStart = nativeStart;
1704        fAnchorLimit = nativeLimit;
1705    }
1706    return *this;
1707}
1708
1709RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1710  return region(start, limit, -1, status);
1711}
1712
1713//--------------------------------------------------------------------------------
1714//
1715//    regionEnd
1716//
1717//--------------------------------------------------------------------------------
1718int32_t RegexMatcher::regionEnd() const {
1719    return (int32_t)fRegionLimit;
1720}
1721
1722int64_t RegexMatcher::regionEnd64() const {
1723    return fRegionLimit;
1724}
1725
1726//--------------------------------------------------------------------------------
1727//
1728//    regionStart
1729//
1730//--------------------------------------------------------------------------------
1731int32_t RegexMatcher::regionStart() const {
1732    return (int32_t)fRegionStart;
1733}
1734
1735int64_t RegexMatcher::regionStart64() const {
1736    return fRegionStart;
1737}
1738
1739
1740//--------------------------------------------------------------------------------
1741//
1742//    replaceAll
1743//
1744//--------------------------------------------------------------------------------
1745UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1746    UText replacementText = UTEXT_INITIALIZER;
1747    UText resultText = UTEXT_INITIALIZER;
1748    UnicodeString resultString;
1749    if (U_FAILURE(status)) {
1750        return resultString;
1751    }
1752
1753    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754    utext_openUnicodeString(&resultText, &resultString, &status);
1755
1756    replaceAll(&replacementText, &resultText, status);
1757
1758    utext_close(&resultText);
1759    utext_close(&replacementText);
1760
1761    return resultString;
1762}
1763
1764
1765//
1766//    replaceAll, UText mode
1767//
1768UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1769    if (U_FAILURE(status)) {
1770        return dest;
1771    }
1772    if (U_FAILURE(fDeferredStatus)) {
1773        status = fDeferredStatus;
1774        return dest;
1775    }
1776
1777    if (dest == NULL) {
1778        UnicodeString emptyString;
1779        UText empty = UTEXT_INITIALIZER;
1780
1781        utext_openUnicodeString(&empty, &emptyString, &status);
1782        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783        utext_close(&empty);
1784    }
1785
1786    if (U_SUCCESS(status)) {
1787        reset();
1788        while (find()) {
1789            appendReplacement(dest, replacement, status);
1790            if (U_FAILURE(status)) {
1791                break;
1792            }
1793        }
1794        appendTail(dest, status);
1795    }
1796
1797    return dest;
1798}
1799
1800
1801//--------------------------------------------------------------------------------
1802//
1803//    replaceFirst
1804//
1805//--------------------------------------------------------------------------------
1806UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1807    UText replacementText = UTEXT_INITIALIZER;
1808    UText resultText = UTEXT_INITIALIZER;
1809    UnicodeString resultString;
1810
1811    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1812    utext_openUnicodeString(&resultText, &resultString, &status);
1813
1814    replaceFirst(&replacementText, &resultText, status);
1815
1816    utext_close(&resultText);
1817    utext_close(&replacementText);
1818
1819    return resultString;
1820}
1821
1822//
1823//    replaceFirst, UText mode
1824//
1825UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1826    if (U_FAILURE(status)) {
1827        return dest;
1828    }
1829    if (U_FAILURE(fDeferredStatus)) {
1830        status = fDeferredStatus;
1831        return dest;
1832    }
1833
1834    reset();
1835    if (!find()) {
1836        return getInput(dest, status);
1837    }
1838
1839    if (dest == NULL) {
1840        UnicodeString emptyString;
1841        UText empty = UTEXT_INITIALIZER;
1842
1843        utext_openUnicodeString(&empty, &emptyString, &status);
1844        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1845        utext_close(&empty);
1846    }
1847
1848    appendReplacement(dest, replacement, status);
1849    appendTail(dest, status);
1850
1851    return dest;
1852}
1853
1854
1855//--------------------------------------------------------------------------------
1856//
1857//     requireEnd
1858//
1859//--------------------------------------------------------------------------------
1860UBool RegexMatcher::requireEnd() const {
1861    return fRequireEnd;
1862}
1863
1864
1865//--------------------------------------------------------------------------------
1866//
1867//     reset
1868//
1869//--------------------------------------------------------------------------------
1870RegexMatcher &RegexMatcher::reset() {
1871    fRegionStart    = 0;
1872    fRegionLimit    = fInputLength;
1873    fActiveStart    = 0;
1874    fActiveLimit    = fInputLength;
1875    fAnchorStart    = 0;
1876    fAnchorLimit    = fInputLength;
1877    fLookStart      = 0;
1878    fLookLimit      = fInputLength;
1879    resetPreserveRegion();
1880    return *this;
1881}
1882
1883
1884
1885void RegexMatcher::resetPreserveRegion() {
1886    fMatchStart     = 0;
1887    fMatchEnd       = 0;
1888    fLastMatchEnd   = -1;
1889    fAppendPosition = 0;
1890    fMatch          = FALSE;
1891    fHitEnd         = FALSE;
1892    fRequireEnd     = FALSE;
1893    fTime           = 0;
1894    fTickCounter    = TIMER_INITIAL_VALUE;
1895    //resetStack(); // more expensive than it looks...
1896}
1897
1898
1899RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1900    fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1901    if (fPattern->fNeedsAltInput) {
1902        fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1903    }
1904    fInputLength = utext_nativeLength(fInputText);
1905
1906    reset();
1907    delete fInput;
1908    fInput = NULL;
1909
1910    //  Do the following for any UnicodeString.
1911    //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1912    fInputUniStrMaybeMutable = TRUE;
1913
1914    if (fWordBreakItr != NULL) {
1915#if UCONFIG_NO_BREAK_ITERATION==0
1916        UErrorCode status = U_ZERO_ERROR;
1917        fWordBreakItr->setText(fInputText, status);
1918#endif
1919    }
1920    return *this;
1921}
1922
1923
1924RegexMatcher &RegexMatcher::reset(UText *input) {
1925    if (fInputText != input) {
1926        fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1927        if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1928        fInputLength = utext_nativeLength(fInputText);
1929
1930        delete fInput;
1931        fInput = NULL;
1932
1933        if (fWordBreakItr != NULL) {
1934#if UCONFIG_NO_BREAK_ITERATION==0
1935            UErrorCode status = U_ZERO_ERROR;
1936            fWordBreakItr->setText(input, status);
1937#endif
1938        }
1939    }
1940    reset();
1941    fInputUniStrMaybeMutable = FALSE;
1942
1943    return *this;
1944}
1945
1946/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1947    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1948    return *this;
1949}*/
1950
1951RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1952    if (U_FAILURE(status)) {
1953        return *this;
1954    }
1955    reset();       // Reset also resets the region to be the entire string.
1956
1957    if (position < 0 || position > fActiveLimit) {
1958        status = U_INDEX_OUTOFBOUNDS_ERROR;
1959        return *this;
1960    }
1961    fMatchEnd = position;
1962    return *this;
1963}
1964
1965
1966// BEGIN android-added
1967// Removed this function after Android upgrad to ICU4.8.
1968//--------------------------------------------------------------------------------
1969//
1970//    refresh
1971//
1972//--------------------------------------------------------------------------------
1973RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1974    if (U_FAILURE(status)) {
1975        return *this;
1976    }
1977    if (input == NULL) {
1978        status = U_ILLEGAL_ARGUMENT_ERROR;
1979        return *this;
1980    }
1981    if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1982        status = U_ILLEGAL_ARGUMENT_ERROR;
1983        return *this;
1984    }
1985    int64_t  pos = utext_getNativeIndex(fInputText);
1986    //  Shallow read-only clone of the new UText into the existing input UText
1987    fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1988    if (U_FAILURE(status)) {
1989        return *this;
1990    }
1991    utext_setNativeIndex(fInputText, pos);
1992
1993    if (fAltInputText != NULL) {
1994        pos = utext_getNativeIndex(fAltInputText);
1995        fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1996        if (U_FAILURE(status)) {
1997            return *this;
1998        }
1999        utext_setNativeIndex(fAltInputText, pos);
2000    }
2001    return *this;
2002}
2003// END android-added
2004
2005
2006//--------------------------------------------------------------------------------
2007//
2008//    setTrace
2009//
2010//--------------------------------------------------------------------------------
2011void RegexMatcher::setTrace(UBool state) {
2012    fTraceDebug = state;
2013}
2014
2015
2016
2017//---------------------------------------------------------------------
2018//
2019//   split
2020//
2021//---------------------------------------------------------------------
2022int32_t  RegexMatcher::split(const UnicodeString &input,
2023        UnicodeString    dest[],
2024        int32_t          destCapacity,
2025        UErrorCode      &status)
2026{
2027    UText inputText = UTEXT_INITIALIZER;
2028    utext_openConstUnicodeString(&inputText, &input, &status);
2029    if (U_FAILURE(status)) {
2030        return 0;
2031    }
2032
2033    UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2034    if (destText == NULL) {
2035        status = U_MEMORY_ALLOCATION_ERROR;
2036        return 0;
2037    }
2038    int32_t i;
2039    for (i = 0; i < destCapacity; i++) {
2040        destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2041    }
2042
2043    int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2044
2045    for (i = 0; i < destCapacity; i++) {
2046        utext_close(destText[i]);
2047    }
2048
2049    uprv_free(destText);
2050    utext_close(&inputText);
2051    return fieldCount;
2052}
2053
2054//
2055//   split, UText mode
2056//
2057int32_t  RegexMatcher::split(UText *input,
2058        UText           *dest[],
2059        int32_t          destCapacity,
2060        UErrorCode      &status)
2061{
2062    //
2063    // Check arguements for validity
2064    //
2065    if (U_FAILURE(status)) {
2066        return 0;
2067    };
2068
2069    if (destCapacity < 1) {
2070        status = U_ILLEGAL_ARGUMENT_ERROR;
2071        return 0;
2072    }
2073
2074    //
2075    // Reset for the input text
2076    //
2077    reset(input);
2078    int64_t   nextOutputStringStart = 0;
2079    if (fActiveLimit == 0) {
2080        return 0;
2081    }
2082
2083    //
2084    // Loop through the input text, searching for the delimiter pattern
2085    //
2086    int32_t i;
2087    int32_t numCaptureGroups = fPattern->fGroupMap->size();
2088    for (i=0; ; i++) {
2089        if (i>=destCapacity-1) {
2090            // There is one or zero output string left.
2091            // Fill the last output string with whatever is left from the input, then exit the loop.
2092            //  ( i will be == destCapacity if we filled the output array while processing
2093            //    capture groups of the delimiter expression, in which case we will discard the
2094            //    last capture group saved in favor of the unprocessed remainder of the
2095            //    input string.)
2096            i = destCapacity-1;
2097            if (fActiveLimit > nextOutputStringStart) {
2098                if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2099                    if (dest[i]) {
2100                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2101                                      input->chunkContents+nextOutputStringStart,
2102                                      (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2103                    } else {
2104                        UText remainingText = UTEXT_INITIALIZER;
2105                        utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2106                                         fActiveLimit-nextOutputStringStart, &status);
2107                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2108                        utext_close(&remainingText);
2109                    }
2110                } else {
2111                    UErrorCode lengthStatus = U_ZERO_ERROR;
2112                    int32_t remaining16Length =
2113                        utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2114                    UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2115                    if (remainingChars == NULL) {
2116                        status = U_MEMORY_ALLOCATION_ERROR;
2117                        break;
2118                    }
2119
2120                    utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2121                    if (dest[i]) {
2122                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2123                    } else {
2124                        UText remainingText = UTEXT_INITIALIZER;
2125                        utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2126                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2127                        utext_close(&remainingText);
2128                    }
2129
2130                    uprv_free(remainingChars);
2131                }
2132            }
2133            break;
2134        }
2135        if (find()) {
2136            // We found another delimiter.  Move everything from where we started looking
2137            //  up until the start of the delimiter into the next output string.
2138            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2139                if (dest[i]) {
2140                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2141                                  input->chunkContents+nextOutputStringStart,
2142                                  (int32_t)(fMatchStart-nextOutputStringStart), &status);
2143                } else {
2144                    UText remainingText = UTEXT_INITIALIZER;
2145                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2146                                      fMatchStart-nextOutputStringStart, &status);
2147                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2148                    utext_close(&remainingText);
2149                }
2150            } else {
2151                UErrorCode lengthStatus = U_ZERO_ERROR;
2152                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2153                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2154                if (remainingChars == NULL) {
2155                    status = U_MEMORY_ALLOCATION_ERROR;
2156                    break;
2157                }
2158                utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2159                if (dest[i]) {
2160                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2161                } else {
2162                    UText remainingText = UTEXT_INITIALIZER;
2163                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2164                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2165                    utext_close(&remainingText);
2166                }
2167
2168                uprv_free(remainingChars);
2169            }
2170            nextOutputStringStart = fMatchEnd;
2171
2172            // If the delimiter pattern has capturing parentheses, the captured
2173            //  text goes out into the next n destination strings.
2174            int32_t groupNum;
2175            UBool lastGroupWasNullUText = FALSE;
2176            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2177                if (i==destCapacity-1) {
2178                    break;
2179                }
2180                i++;
2181                lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE);
2182                dest[i] = group(groupNum, dest[i], status);
2183            }
2184
2185            if (nextOutputStringStart == fActiveLimit) {
2186                // The delimiter was at the end of the string.  We're done.
2187                break;
2188            } else if (i == destCapacity-1) {
2189                // We're out of capture groups, and the rest of the string is more important
2190                if (lastGroupWasNullUText) {
2191                    utext_close(dest[i]);
2192                    dest[i] = NULL;
2193                }
2194            }
2195
2196        }
2197        else
2198        {
2199            // We ran off the end of the input while looking for the next delimiter.
2200            // All the remaining text goes into the current output string.
2201            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2202                if (dest[i]) {
2203                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2204                                  input->chunkContents+nextOutputStringStart,
2205                                  (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2206                } else {
2207                    UText remainingText = UTEXT_INITIALIZER;
2208                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2209                                     fActiveLimit-nextOutputStringStart, &status);
2210                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2211                    utext_close(&remainingText);
2212                }
2213            } else {
2214                UErrorCode lengthStatus = U_ZERO_ERROR;
2215                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2216                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2217                if (remainingChars == NULL) {
2218                    status = U_MEMORY_ALLOCATION_ERROR;
2219                    break;
2220                }
2221
2222                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2223                if (dest[i]) {
2224                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2225                } else {
2226                    UText remainingText = UTEXT_INITIALIZER;
2227                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2228                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2229                    utext_close(&remainingText);
2230                }
2231
2232                uprv_free(remainingChars);
2233            }
2234            break;
2235        }
2236        if (U_FAILURE(status)) {
2237            break;
2238        }
2239    }   // end of for loop
2240    return i+1;
2241}
2242
2243
2244//--------------------------------------------------------------------------------
2245//
2246//     start
2247//
2248//--------------------------------------------------------------------------------
2249int32_t RegexMatcher::start(UErrorCode &status) const {
2250    return start(0, status);
2251}
2252
2253int64_t RegexMatcher::start64(UErrorCode &status) const {
2254    return start64(0, status);
2255}
2256
2257//--------------------------------------------------------------------------------
2258//
2259//     start(int32_t group, UErrorCode &status)
2260//
2261//--------------------------------------------------------------------------------
2262
2263int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2264    if (U_FAILURE(status)) {
2265        return -1;
2266    }
2267    if (U_FAILURE(fDeferredStatus)) {
2268        status = fDeferredStatus;
2269        return -1;
2270    }
2271    if (fMatch == FALSE) {
2272        status = U_REGEX_INVALID_STATE;
2273        return -1;
2274    }
2275    if (group < 0 || group > fPattern->fGroupMap->size()) {
2276        status = U_INDEX_OUTOFBOUNDS_ERROR;
2277        return -1;
2278    }
2279    int64_t s;
2280    if (group == 0) {
2281        s = fMatchStart;
2282    } else {
2283        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2284        U_ASSERT(groupOffset < fPattern->fFrameSize);
2285        U_ASSERT(groupOffset >= 0);
2286        s = fFrame->fExtra[groupOffset];
2287    }
2288
2289    return s;
2290}
2291
2292
2293int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2294    return (int32_t)start64(group, status);
2295}
2296
2297//--------------------------------------------------------------------------------
2298//
2299//     useAnchoringBounds
2300//
2301//--------------------------------------------------------------------------------
2302RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2303    fAnchoringBounds = b;
2304    fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2305    fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2306    return *this;
2307}
2308
2309
2310//--------------------------------------------------------------------------------
2311//
2312//     useTransparentBounds
2313//
2314//--------------------------------------------------------------------------------
2315RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2316    fTransparentBounds = b;
2317    fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2318    fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2319    return *this;
2320}
2321
2322//--------------------------------------------------------------------------------
2323//
2324//     setTimeLimit
2325//
2326//--------------------------------------------------------------------------------
2327void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2328    if (U_FAILURE(status)) {
2329        return;
2330    }
2331    if (U_FAILURE(fDeferredStatus)) {
2332        status = fDeferredStatus;
2333        return;
2334    }
2335    if (limit < 0) {
2336        status = U_ILLEGAL_ARGUMENT_ERROR;
2337        return;
2338    }
2339    fTimeLimit = limit;
2340}
2341
2342
2343//--------------------------------------------------------------------------------
2344//
2345//     getTimeLimit
2346//
2347//--------------------------------------------------------------------------------
2348int32_t RegexMatcher::getTimeLimit() const {
2349    return fTimeLimit;
2350}
2351
2352
2353//--------------------------------------------------------------------------------
2354//
2355//     setStackLimit
2356//
2357//--------------------------------------------------------------------------------
2358void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2359    if (U_FAILURE(status)) {
2360        return;
2361    }
2362    if (U_FAILURE(fDeferredStatus)) {
2363        status = fDeferredStatus;
2364        return;
2365    }
2366    if (limit < 0) {
2367        status = U_ILLEGAL_ARGUMENT_ERROR;
2368        return;
2369    }
2370
2371    // Reset the matcher.  This is needed here in case there is a current match
2372    //    whose final stack frame (containing the match results, pointed to by fFrame)
2373    //    would be lost by resizing to a smaller stack size.
2374    reset();
2375
2376    if (limit == 0) {
2377        // Unlimited stack expansion
2378        fStack->setMaxCapacity(0);
2379    } else {
2380        // Change the units of the limit  from bytes to ints, and bump the size up
2381        //   to be big enough to hold at least one stack frame for the pattern,
2382        //   if it isn't there already.
2383        int32_t adjustedLimit = limit / sizeof(int32_t);
2384        if (adjustedLimit < fPattern->fFrameSize) {
2385            adjustedLimit = fPattern->fFrameSize;
2386        }
2387        fStack->setMaxCapacity(adjustedLimit);
2388    }
2389    fStackLimit = limit;
2390}
2391
2392
2393//--------------------------------------------------------------------------------
2394//
2395//     getStackLimit
2396//
2397//--------------------------------------------------------------------------------
2398int32_t RegexMatcher::getStackLimit() const {
2399    return fStackLimit;
2400}
2401
2402
2403//--------------------------------------------------------------------------------
2404//
2405//     setMatchCallback
2406//
2407//--------------------------------------------------------------------------------
2408void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2409                                    const void              *context,
2410                                    UErrorCode              &status) {
2411    if (U_FAILURE(status)) {
2412        return;
2413    }
2414    fCallbackFn = callback;
2415    fCallbackContext = context;
2416}
2417
2418
2419//--------------------------------------------------------------------------------
2420//
2421//     getMatchCallback
2422//
2423//--------------------------------------------------------------------------------
2424void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2425                                  const void              *&context,
2426                                  UErrorCode              &status) {
2427    if (U_FAILURE(status)) {
2428       return;
2429    }
2430    callback = fCallbackFn;
2431    context  = fCallbackContext;
2432}
2433
2434
2435//--------------------------------------------------------------------------------
2436//
2437//     setMatchCallback
2438//
2439//--------------------------------------------------------------------------------
2440void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2441                                                const void                      *context,
2442                                                UErrorCode                      &status) {
2443    if (U_FAILURE(status)) {
2444        return;
2445    }
2446    fFindProgressCallbackFn = callback;
2447    fFindProgressCallbackContext = context;
2448}
2449
2450
2451//--------------------------------------------------------------------------------
2452//
2453//     getMatchCallback
2454//
2455//--------------------------------------------------------------------------------
2456void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2457                                                const void                    *&context,
2458                                                UErrorCode                    &status) {
2459    if (U_FAILURE(status)) {
2460       return;
2461    }
2462    callback = fFindProgressCallbackFn;
2463    context  = fFindProgressCallbackContext;
2464}
2465
2466
2467//================================================================================
2468//
2469//    Code following this point in this file is the internal
2470//    Match Engine Implementation.
2471//
2472//================================================================================
2473
2474
2475//--------------------------------------------------------------------------------
2476//
2477//   resetStack
2478//           Discard any previous contents of the state save stack, and initialize a
2479//           new stack frame to all -1.  The -1s are needed for capture group limits,
2480//           where they indicate that a group has not yet matched anything.
2481//--------------------------------------------------------------------------------
2482REStackFrame *RegexMatcher::resetStack() {
2483    // Discard any previous contents of the state save stack, and initialize a
2484    //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2485    //  where they indicate that a group has not yet matched anything.
2486    fStack->removeAllElements();
2487
2488    REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2489    int32_t i;
2490    for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2491        iFrame->fExtra[i] = -1;
2492    }
2493    return iFrame;
2494}
2495
2496
2497
2498//--------------------------------------------------------------------------------
2499//
2500//   isWordBoundary
2501//                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2502//                     For us,
2503//                       If the current char is a combining mark,
2504//                          \b is FALSE.
2505//                       Else Scan backwards to the first non-combining char.
2506//                            We are at a boundary if the this char and the original chars are
2507//                               opposite in membership in \w set
2508//
2509//          parameters:   pos   - the current position in the input buffer
2510//
2511//              TODO:  double-check edge cases at region boundaries.
2512//
2513//--------------------------------------------------------------------------------
2514UBool RegexMatcher::isWordBoundary(int64_t pos) {
2515    UBool isBoundary = FALSE;
2516    UBool cIsWord    = FALSE;
2517
2518    if (pos >= fLookLimit) {
2519        fHitEnd = TRUE;
2520    } else {
2521        // Determine whether char c at current position is a member of the word set of chars.
2522        // If we're off the end of the string, behave as though we're not at a word char.
2523        UTEXT_SETNATIVEINDEX(fInputText, pos);
2524        UChar32  c = UTEXT_CURRENT32(fInputText);
2525        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2526            // Current char is a combining one.  Not a boundary.
2527            return FALSE;
2528        }
2529        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2530    }
2531
2532    // Back up until we come to a non-combining char, determine whether
2533    //  that char is a word char.
2534    UBool prevCIsWord = FALSE;
2535    for (;;) {
2536        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2537            break;
2538        }
2539        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2540        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2541              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2542            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2543            break;
2544        }
2545    }
2546    isBoundary = cIsWord ^ prevCIsWord;
2547    return isBoundary;
2548}
2549
2550UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2551    UBool isBoundary = FALSE;
2552    UBool cIsWord    = FALSE;
2553
2554    const UChar *inputBuf = fInputText->chunkContents;
2555
2556    if (pos >= fLookLimit) {
2557        fHitEnd = TRUE;
2558    } else {
2559        // Determine whether char c at current position is a member of the word set of chars.
2560        // If we're off the end of the string, behave as though we're not at a word char.
2561        UChar32 c;
2562        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2563        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2564            // Current char is a combining one.  Not a boundary.
2565            return FALSE;
2566        }
2567        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2568    }
2569
2570    // Back up until we come to a non-combining char, determine whether
2571    //  that char is a word char.
2572    UBool prevCIsWord = FALSE;
2573    for (;;) {
2574        if (pos <= fLookStart) {
2575            break;
2576        }
2577        UChar32 prevChar;
2578        U16_PREV(inputBuf, fLookStart, pos, prevChar);
2579        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2580              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2581            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2582            break;
2583        }
2584    }
2585    isBoundary = cIsWord ^ prevCIsWord;
2586    return isBoundary;
2587}
2588
2589//--------------------------------------------------------------------------------
2590//
2591//   isUWordBoundary
2592//
2593//         Test for a word boundary using RBBI word break.
2594//
2595//          parameters:   pos   - the current position in the input buffer
2596//
2597//--------------------------------------------------------------------------------
2598UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2599    UBool       returnVal = FALSE;
2600#if UCONFIG_NO_BREAK_ITERATION==0
2601
2602    // If we haven't yet created a break iterator for this matcher, do it now.
2603    if (fWordBreakItr == NULL) {
2604        fWordBreakItr =
2605            (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2606        if (U_FAILURE(fDeferredStatus)) {
2607            return FALSE;
2608        }
2609        fWordBreakItr->setText(fInputText, fDeferredStatus);
2610    }
2611
2612    if (pos >= fLookLimit) {
2613        fHitEnd = TRUE;
2614        returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2615                            //    words are not boundaries.  All non-word chars stand by themselves,
2616                            //    with word boundaries on both sides.
2617    } else {
2618        if (!UTEXT_USES_U16(fInputText)) {
2619            // !!!: Would like a better way to do this!
2620            UErrorCode status = U_ZERO_ERROR;
2621            pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2622        }
2623        returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2624    }
2625#endif
2626    return   returnVal;
2627}
2628
2629//--------------------------------------------------------------------------------
2630//
2631//   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2632//                     saves. Increment the "time" counter, and call the
2633//                     user callback function if there is one installed.
2634//
2635//                     If the match operation needs to be aborted, either for a time-out
2636//                     or because the user callback asked for it, just set an error status.
2637//                     The engine will pick that up and stop in its outer loop.
2638//
2639//--------------------------------------------------------------------------------
2640void RegexMatcher::IncrementTime(UErrorCode &status) {
2641    fTickCounter = TIMER_INITIAL_VALUE;
2642    fTime++;
2643    if (fCallbackFn != NULL) {
2644        if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2645            status = U_REGEX_STOPPED_BY_CALLER;
2646            return;
2647        }
2648    }
2649    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2650        status = U_REGEX_TIME_OUT;
2651    }
2652}
2653
2654//--------------------------------------------------------------------------------
2655//
2656//   ReportFindProgress     This function is called once for each advance in the target
2657//                          string from the find() function, and calls the user progress callback
2658//                          function if there is one installed.
2659//
2660//                          NOTE:
2661//
2662//                          If the match operation needs to be aborted because the user
2663//                          callback asked for it, just set an error status.
2664//                          The engine will pick that up and stop in its outer loop.
2665//
2666//--------------------------------------------------------------------------------
2667UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2668    if (fFindProgressCallbackFn != NULL) {
2669        if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2670            status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2671            return FALSE;
2672        }
2673    }
2674    return TRUE;
2675}
2676
2677//--------------------------------------------------------------------------------
2678//
2679//   StateSave
2680//       Make a new stack frame, initialized as a copy of the current stack frame.
2681//       Set the pattern index in the original stack frame from the operand value
2682//       in the opcode.  Execution of the engine continues with the state in
2683//       the newly created stack frame
2684//
2685//       Note that reserveBlock() may grow the stack, resulting in the
2686//       whole thing being relocated in memory.
2687//
2688//    Parameters:
2689//       fp           The top frame pointer when called.  At return, a new
2690//                    fame will be present
2691//       savePatIdx   An index into the compiled pattern.  Goes into the original
2692//                    (not new) frame.  If execution ever back-tracks out of the
2693//                    new frame, this will be where we continue from in the pattern.
2694//    Return
2695//                    The new frame pointer.
2696//
2697//--------------------------------------------------------------------------------
2698inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2699    // push storage for a new frame.
2700    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2701    if (newFP == NULL) {
2702        // Failure on attempted stack expansion.
2703        //   Stack function set some other error code, change it to a more
2704        //   specific one for regular expressions.
2705        status = U_REGEX_STACK_OVERFLOW;
2706        // We need to return a writable stack frame, so just return the
2707        //    previous frame.  The match operation will stop quickly
2708        //    because of the error status, after which the frame will never
2709        //    be looked at again.
2710        return fp;
2711    }
2712    fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2713
2714    // New stack frame = copy of old top frame.
2715    int64_t *source = (int64_t *)fp;
2716    int64_t *dest   = newFP;
2717    for (;;) {
2718        *dest++ = *source++;
2719        if (source == newFP) {
2720            break;
2721        }
2722    }
2723
2724    fTickCounter--;
2725    if (fTickCounter <= 0) {
2726       IncrementTime(status);    // Re-initializes fTickCounter
2727    }
2728    fp->fPatIdx = savePatIdx;
2729    return (REStackFrame *)newFP;
2730}
2731
2732
2733//--------------------------------------------------------------------------------
2734//
2735//   MatchAt      This is the actual matching engine.
2736//
2737//                  startIdx:    begin matching a this index.
2738//                  toEnd:       if true, match must extend to end of the input region
2739//
2740//--------------------------------------------------------------------------------
2741void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2742    UBool       isMatch  = FALSE;      // True if the we have a match.
2743
2744    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2745
2746    int32_t     op;                    // Operation from the compiled pattern, split into
2747    int32_t     opType;                //    the opcode
2748    int32_t     opValue;               //    and the operand value.
2749
2750    #ifdef REGEX_RUN_DEBUG
2751    if (fTraceDebug)
2752    {
2753        printf("MatchAt(startIdx=%ld)\n", startIdx);
2754        printf("Original Pattern: ");
2755        UChar32 c = utext_next32From(fPattern->fPattern, 0);
2756        while (c != U_SENTINEL) {
2757            if (c<32 || c>256) {
2758                c = '.';
2759            }
2760            REGEX_DUMP_DEBUG_PRINTF(("%c", c));
2761
2762            c = UTEXT_NEXT32(fPattern->fPattern);
2763        }
2764        printf("\n");
2765        printf("Input String: ");
2766        c = utext_next32From(fInputText, 0);
2767        while (c != U_SENTINEL) {
2768            if (c<32 || c>256) {
2769                c = '.';
2770            }
2771            printf("%c", c);
2772
2773            c = UTEXT_NEXT32(fInputText);
2774        }
2775        printf("\n");
2776        printf("\n");
2777    }
2778    #endif
2779
2780    if (U_FAILURE(status)) {
2781        return;
2782    }
2783
2784    //  Cache frequently referenced items from the compiled pattern
2785    //
2786    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2787
2788    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2789    UVector             *sets          = fPattern->fSets;
2790
2791    fFrameSize = fPattern->fFrameSize;
2792    REStackFrame        *fp            = resetStack();
2793
2794    fp->fPatIdx   = 0;
2795    fp->fInputIdx = startIdx;
2796
2797    // Zero out the pattern's static data
2798    int32_t i;
2799    for (i = 0; i<fPattern->fDataSize; i++) {
2800        fData[i] = 0;
2801    }
2802
2803    //
2804    //  Main loop for interpreting the compiled pattern.
2805    //  One iteration of the loop per pattern operation performed.
2806    //
2807    for (;;) {
2808#if 0
2809        if (_heapchk() != _HEAPOK) {
2810            fprintf(stderr, "Heap Trouble\n");
2811        }
2812#endif
2813
2814        op      = (int32_t)pat[fp->fPatIdx];
2815        opType  = URX_TYPE(op);
2816        opValue = URX_VAL(op);
2817        #ifdef REGEX_RUN_DEBUG
2818        if (fTraceDebug) {
2819            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2820            printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
2821                UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2822            fPattern->dumpOp(fp->fPatIdx);
2823        }
2824        #endif
2825        fp->fPatIdx++;
2826
2827        switch (opType) {
2828
2829
2830        case URX_NOP:
2831            break;
2832
2833
2834        case URX_BACKTRACK:
2835            // Force a backtrack.  In some circumstances, the pattern compiler
2836            //   will notice that the pattern can't possibly match anything, and will
2837            //   emit one of these at that point.
2838            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2839            break;
2840
2841
2842        case URX_ONECHAR:
2843            if (fp->fInputIdx < fActiveLimit) {
2844                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2845                UChar32 c = UTEXT_NEXT32(fInputText);
2846                if (c == opValue) {
2847                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2848                    break;
2849                }
2850            } else {
2851                fHitEnd = TRUE;
2852            }
2853
2854            #ifdef REGEX_SMART_BACKTRACKING
2855            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
2856                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2857                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2858                    UBool success = FALSE;
2859                    UChar32 c = UTEXT_PREVIOUS32(fInputText);
2860                    while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
2861                        if (c == opValue) {
2862                            success = TRUE;
2863                            break;
2864                        } else if (c == U_SENTINEL) {
2865                            break;
2866                        }
2867                        c = UTEXT_PREVIOUS32(fInputText);
2868                    }
2869                    if (success) {
2870                        fHitEnd = FALSE;
2871                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2872                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2873                        if (fp->fInputIdx > backSearchIndex) {
2874                            fp = StateSave(fp, fp->fPatIdx, status);
2875                        }
2876                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
2877                        break;
2878                    }
2879                }
2880            }
2881            #endif
2882
2883            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2884            break;
2885
2886
2887        case URX_STRING:
2888            {
2889                // Test input against a literal string.
2890                // Strings require two slots in the compiled pattern, one for the
2891                //   offset to the string text, and one for the length.
2892                int32_t   stringStartIdx = opValue;
2893                int32_t   stringLen;
2894
2895                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2896                fp->fPatIdx++;
2897                opType    = URX_TYPE(op);
2898                stringLen = URX_VAL(op);
2899                U_ASSERT(opType == URX_STRING_LEN);
2900                U_ASSERT(stringLen >= 2);
2901
2902                const UChar *patternChars = litText+stringStartIdx;
2903                const UChar *patternEnd = patternChars+stringLen;
2904
2905                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2906                UChar32 c;
2907                UBool success = TRUE;
2908
2909                while (patternChars < patternEnd && success) {
2910                    c = UTEXT_NEXT32(fInputText);
2911
2912                    if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) {
2913                        if (U_IS_BMP(c)) {
2914                            success = (*patternChars == c);
2915                            patternChars += 1;
2916                        } else if (patternChars+1 < patternEnd) {
2917                            success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
2918                            patternChars += 2;
2919                        }
2920                    } else {
2921                        success = FALSE;
2922                        fHitEnd = TRUE;          //   TODO:  See ticket 6074
2923                    }
2924                }
2925
2926                if (success) {
2927                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2928                } else {
2929                    #ifdef REGEX_SMART_BACKTRACKING
2930                    if (fp->fInputIdx > backSearchIndex && fStack->size()) {
2931                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
2932                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
2933                            // Reset to last start point
2934                            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2935                            patternChars = litText+stringStartIdx;
2936
2937                            // Search backwards for a possible start
2938                            do {
2939                                c = UTEXT_PREVIOUS32(fInputText);
2940                                if (c == U_SENTINEL) {
2941                                    break;
2942                                } else if ((U_IS_BMP(c) && *patternChars == c) ||
2943                                    (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
2944                                    success = TRUE;
2945                                    break;
2946                                }
2947                            } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
2948
2949                            // And try again
2950                            if (success) {
2951                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2952                                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2953                                if (fp->fInputIdx > backSearchIndex) {
2954                                    fp = StateSave(fp, fp->fPatIdx, status);
2955                                }
2956                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
2957                                break;
2958                            }
2959                        }
2960                    }
2961                    #endif
2962                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2963                }
2964            }
2965            break;
2966
2967
2968        case URX_STATE_SAVE:
2969            fp = StateSave(fp, opValue, status);
2970            break;
2971
2972
2973        case URX_END:
2974            // The match loop will exit via this path on a successful match,
2975            //   when we reach the end of the pattern.
2976            if (toEnd && fp->fInputIdx != fActiveLimit) {
2977                // The pattern matched, but not to the end of input.  Try some more.
2978                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2979                break;
2980            }
2981            isMatch = TRUE;
2982            goto  breakFromLoop;
2983
2984        // Start and End Capture stack frame variables are laid out out like this:
2985            //  fp->fExtra[opValue]  - The start of a completed capture group
2986            //             opValue+1 - The end   of a completed capture group
2987            //             opValue+2 - the start of a capture group whose end
2988            //                          has not yet been reached (and might not ever be).
2989        case URX_START_CAPTURE:
2990            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2991            fp->fExtra[opValue+2] = fp->fInputIdx;
2992            break;
2993
2994
2995        case URX_END_CAPTURE:
2996            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2997            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2998            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2999            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
3000            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
3001            break;
3002
3003
3004        case URX_DOLLAR:                   //  $, test for End of line
3005                                           //     or for position before new line at end of input
3006            {
3007                if (fp->fInputIdx >= fAnchorLimit) {
3008                    // We really are at the end of input.  Success.
3009                    fHitEnd = TRUE;
3010                    fRequireEnd = TRUE;
3011                    break;
3012                }
3013
3014                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3015
3016                // If we are positioned just before a new-line that is located at the
3017                //   end of input, succeed.
3018                UChar32 c = UTEXT_NEXT32(fInputText);
3019                if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3020                    if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
3021                        // If not in the middle of a CR/LF sequence
3022                        if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
3023                            // At new-line at end of input. Success
3024                            fHitEnd = TRUE;
3025                            fRequireEnd = TRUE;
3026
3027                            break;
3028                        }
3029                    }
3030                } else {
3031                    UChar32 nextC = UTEXT_NEXT32(fInputText);
3032                    if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
3033                        fHitEnd = TRUE;
3034                        fRequireEnd = TRUE;
3035                        break;                         // At CR/LF at end of input.  Success
3036                    }
3037                }
3038
3039                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3040            }
3041            break;
3042
3043
3044         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
3045            if (fp->fInputIdx >= fAnchorLimit) {
3046                // Off the end of input.  Success.
3047                fHitEnd = TRUE;
3048                fRequireEnd = TRUE;
3049                break;
3050            } else {
3051                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3052                UChar32 c = UTEXT_NEXT32(fInputText);
3053                // Either at the last character of input, or off the end.
3054                if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3055                    fHitEnd = TRUE;
3056                    fRequireEnd = TRUE;
3057                    break;
3058                }
3059            }
3060
3061            // Not at end of input.  Back-track out.
3062            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3063            break;
3064
3065
3066         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3067             {
3068                 if (fp->fInputIdx >= fAnchorLimit) {
3069                     // We really are at the end of input.  Success.
3070                     fHitEnd = TRUE;
3071                     fRequireEnd = TRUE;
3072                     break;
3073                 }
3074                 // If we are positioned just before a new-line, succeed.
3075                 // It makes no difference where the new-line is within the input.
3076                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3077                 UChar32 c = UTEXT_CURRENT32(fInputText);
3078                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
3079                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3080                     //  In multi-line mode, hitting a new-line just before the end of input does not
3081                     //   set the hitEnd or requireEnd flags
3082                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3083                        break;
3084                     }
3085                 }
3086                 // not at a new line.  Fail.
3087                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3088             }
3089             break;
3090
3091
3092         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3093             {
3094                 if (fp->fInputIdx >= fAnchorLimit) {
3095                     // We really are at the end of input.  Success.
3096                     fHitEnd = TRUE;
3097                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3098                     break;               //   adding a new-line would not lose the match.
3099                 }
3100                 // If we are not positioned just before a new-line, the test fails; backtrack out.
3101                 // It makes no difference where the new-line is within the input.
3102                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3103                 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3104                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3105                 }
3106             }
3107             break;
3108
3109
3110       case URX_CARET:                    //  ^, test for start of line
3111            if (fp->fInputIdx != fAnchorStart) {
3112                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3113            }
3114            break;
3115
3116
3117       case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3118           {
3119               if (fp->fInputIdx == fAnchorStart) {
3120                   // We are at the start input.  Success.
3121                   break;
3122               }
3123               // Check whether character just before the current pos is a new-line
3124               //   unless we are at the end of input
3125               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3126               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3127               if ((fp->fInputIdx < fAnchorLimit) &&
3128                   ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3129                   //  It's a new-line.  ^ is true.  Success.
3130                   //  TODO:  what should be done with positions between a CR and LF?
3131                   break;
3132               }
3133               // Not at the start of a line.  Fail.
3134               fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3135           }
3136           break;
3137
3138
3139       case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3140           {
3141               U_ASSERT(fp->fInputIdx >= fAnchorStart);
3142               if (fp->fInputIdx <= fAnchorStart) {
3143                   // We are at the start input.  Success.
3144                   break;
3145               }
3146               // Check whether character just before the current pos is a new-line
3147               U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3148               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3149               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3150               if (c != 0x0a) {
3151                   // Not at the start of a line.  Back-track out.
3152                   fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3153               }
3154           }
3155           break;
3156
3157        case URX_BACKSLASH_B:          // Test for word boundaries
3158            {
3159                UBool success = isWordBoundary(fp->fInputIdx);
3160                success ^= (opValue != 0);     // flip sense for \B
3161                if (!success) {
3162                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3163                }
3164            }
3165            break;
3166
3167
3168        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3169            {
3170                UBool success = isUWordBoundary(fp->fInputIdx);
3171                success ^= (opValue != 0);     // flip sense for \B
3172                if (!success) {
3173                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3174                }
3175            }
3176            break;
3177
3178
3179        case URX_BACKSLASH_D:            // Test for decimal digit
3180            {
3181                if (fp->fInputIdx >= fActiveLimit) {
3182                    fHitEnd = TRUE;
3183                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3184                    break;
3185                }
3186
3187                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3188
3189                UChar32 c = UTEXT_NEXT32(fInputText);
3190                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3191                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3192                success ^= (opValue != 0);        // flip sense for \D
3193                if (success) {
3194                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3195                } else {
3196                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3197                }
3198            }
3199            break;
3200
3201
3202        case URX_BACKSLASH_G:          // Test for position at end of previous match
3203            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3204                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3205            }
3206            break;
3207
3208
3209        case URX_BACKSLASH_X:
3210            //  Match a Grapheme, as defined by Unicode TR 29.
3211            //  Differs slightly from Perl, which consumes combining marks independently
3212            //    of context.
3213            {
3214
3215                // Fail if at end of input
3216                if (fp->fInputIdx >= fActiveLimit) {
3217                    fHitEnd = TRUE;
3218                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3219                    break;
3220                }
3221
3222                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3223
3224                // Examine (and consume) the current char.
3225                //   Dispatch into a little state machine, based on the char.
3226                UChar32  c;
3227                c = UTEXT_NEXT32(fInputText);
3228                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3229                UnicodeSet **sets = fPattern->fStaticSets;
3230                if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3231                if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3232                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3233                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3234                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3235                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3236                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3237                goto GC_Extend;
3238
3239
3240
3241GC_L:
3242                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3243                c = UTEXT_NEXT32(fInputText);
3244                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3245                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3246                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3247                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3248                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3249                UTEXT_PREVIOUS32(fInputText);
3250                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3251                goto GC_Extend;
3252
3253GC_V:
3254                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3255                c = UTEXT_NEXT32(fInputText);
3256                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3257                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3258                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3259                UTEXT_PREVIOUS32(fInputText);
3260                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3261                goto GC_Extend;
3262
3263GC_T:
3264                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3265                c = UTEXT_NEXT32(fInputText);
3266                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3267                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3268                UTEXT_PREVIOUS32(fInputText);
3269                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3270                goto GC_Extend;
3271
3272GC_Extend:
3273                // Combining characters are consumed here
3274                for (;;) {
3275                    if (fp->fInputIdx >= fActiveLimit) {
3276                        break;
3277                    }
3278                    c = UTEXT_CURRENT32(fInputText);
3279                    if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3280                        break;
3281                    }
3282                    UTEXT_NEXT32(fInputText);
3283                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3284                }
3285                goto GC_Done;
3286
3287GC_Control:
3288                // Most control chars stand alone (don't combine with combining chars),
3289                //   except for that CR/LF sequence is a single grapheme cluster.
3290                if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3291                    c = UTEXT_NEXT32(fInputText);
3292                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3293                }
3294
3295GC_Done:
3296                if (fp->fInputIdx >= fActiveLimit) {
3297                    fHitEnd = TRUE;
3298                }
3299                break;
3300            }
3301
3302
3303
3304
3305        case URX_BACKSLASH_Z:          // Test for end of Input
3306            if (fp->fInputIdx < fAnchorLimit) {
3307                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3308            } else {
3309                fHitEnd = TRUE;
3310                fRequireEnd = TRUE;
3311            }
3312            break;
3313
3314
3315
3316        case URX_STATIC_SETREF:
3317            {
3318                // Test input character against one of the predefined sets
3319                //    (Word Characters, for example)
3320                // The high bit of the op value is a flag for the match polarity.
3321                //    0:   success if input char is in set.
3322                //    1:   success if input char is not in set.
3323                if (fp->fInputIdx >= fActiveLimit) {
3324                    fHitEnd = TRUE;
3325                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3326                    break;
3327                }
3328
3329                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3330                opValue &= ~URX_NEG_SET;
3331                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3332
3333                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3334                UChar32 c = UTEXT_NEXT32(fInputText);
3335                if (c < 256) {
3336                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3337                    if (s8->contains(c)) {
3338                        success = !success;
3339                    }
3340                } else {
3341                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3342                    if (s->contains(c)) {
3343                        success = !success;
3344                    }
3345                }
3346                if (success) {
3347                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3348                } else {
3349                    // the character wasn't in the set.
3350                    #ifdef REGEX_SMART_BACKTRACKING
3351                    if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3352                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3353                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3354                            // Try to find it, backwards
3355                            UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3356                            success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
3357                            do {
3358                                c = UTEXT_PREVIOUS32(fInputText);
3359                                if (c == U_SENTINEL) {
3360                                    break;
3361                                } else if (c < 256) {
3362                                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3363                                    if (s8->contains(c)) {
3364                                        success = !success;
3365                                    }
3366                                } else {
3367                                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3368                                    if (s->contains(c)) {
3369                                        success = !success;
3370                                    }
3371                                }
3372                            } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success);
3373
3374                            if (success && c != U_SENTINEL) {
3375                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3376                                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3377                                if (fp->fInputIdx > backSearchIndex) {
3378                                    fp = StateSave(fp, fp->fPatIdx, status);
3379                                }
3380                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
3381                                break;
3382                            }
3383                        }
3384                    }
3385                    #endif
3386                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3387                }
3388            }
3389            break;
3390
3391
3392        case URX_STAT_SETREF_N:
3393            {
3394                // Test input character for NOT being a member of  one of
3395                //    the predefined sets (Word Characters, for example)
3396                if (fp->fInputIdx >= fActiveLimit) {
3397                    fHitEnd = TRUE;
3398                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3399                    break;
3400                }
3401
3402                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3403
3404                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3405
3406                UChar32 c = UTEXT_NEXT32(fInputText);
3407                if (c < 256) {
3408                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3409                    if (s8->contains(c) == FALSE) {
3410                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3411                        break;
3412                    }
3413                } else {
3414                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3415                    if (s->contains(c) == FALSE) {
3416                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3417                        break;
3418                    }
3419                }
3420                // the character wasn't in the set.
3421                #ifdef REGEX_SMART_BACKTRACKING
3422                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3423                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3424                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3425                        // Try to find it, backwards
3426                        UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3427                        UBool success = FALSE;
3428                        do {
3429                            c = UTEXT_PREVIOUS32(fInputText);
3430                            if (c == U_SENTINEL) {
3431                                break;
3432                            } else if (c < 256) {
3433                                Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3434                                if (s8->contains(c) == FALSE) {
3435                                    success = TRUE;
3436                                    break;
3437                                }
3438                            } else {
3439                                const UnicodeSet *s = fPattern->fStaticSets[opValue];
3440                                if (s->contains(c) == FALSE) {
3441                                    success = TRUE;
3442                                    break;
3443                                }
3444                            }
3445                        } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3446
3447                        if (success) {
3448                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3449                            fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3450                            if (fp->fInputIdx > backSearchIndex) {
3451                                fp = StateSave(fp, fp->fPatIdx, status);
3452                            }
3453                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
3454                            break;
3455                        }
3456                    }
3457                }
3458                #endif
3459                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3460            }
3461            break;
3462
3463
3464        case URX_SETREF:
3465            if (fp->fInputIdx >= fActiveLimit) {
3466                fHitEnd = TRUE;
3467                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3468                break;
3469            } else {
3470                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3471
3472                // There is input left.  Pick up one char and test it for set membership.
3473                UChar32 c = UTEXT_NEXT32(fInputText);
3474                U_ASSERT(opValue > 0 && opValue < sets->size());
3475                if (c<256) {
3476                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3477                    if (s8->contains(c)) {
3478                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3479                        break;
3480                    }
3481                } else {
3482                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3483                    if (s->contains(c)) {
3484                        // The character is in the set.  A Match.
3485                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3486                        break;
3487                    }
3488                }
3489
3490                // the character wasn't in the set.
3491                #ifdef REGEX_SMART_BACKTRACKING
3492                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3493                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3494                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3495                        // Try to find it, backwards
3496                        UTEXT_PREVIOUS32(fInputText); // skip the first character we tried
3497                        UBool success = FALSE;
3498                        do {
3499                            c = UTEXT_PREVIOUS32(fInputText);
3500                            if (c == U_SENTINEL) {
3501                                break;
3502                            } else if (c < 256) {
3503                                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3504                                if (s8->contains(c)) {
3505                                    success = TRUE;
3506                                    break;
3507                                }
3508                            } else {
3509                                UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3510                                if (s->contains(c)) {
3511                                    success = TRUE;
3512                                    break;
3513                                }
3514                            }
3515                        } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
3516
3517                        if (success) {
3518                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3519                            fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3520                            if (fp->fInputIdx > backSearchIndex) {
3521                                fp = StateSave(fp, fp->fPatIdx, status);
3522                            }
3523                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
3524                            break;
3525                        }
3526                    }
3527                }
3528                #endif
3529                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3530            }
3531            break;
3532
3533
3534        case URX_DOTANY:
3535            {
3536                // . matches anything, but stops at end-of-line.
3537                if (fp->fInputIdx >= fActiveLimit) {
3538                    // At end of input.  Match failed.  Backtrack out.
3539                    fHitEnd = TRUE;
3540                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3541                    break;
3542                }
3543
3544                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3545
3546                // There is input left.  Advance over one char, unless we've hit end-of-line
3547                UChar32 c = UTEXT_NEXT32(fInputText);
3548                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
3549                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3550                    // End of line in normal mode.   . does not match.
3551                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3552                    break;
3553                }
3554                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3555            }
3556            break;
3557
3558
3559        case URX_DOTANY_ALL:
3560            {
3561                // ., in dot-matches-all (including new lines) mode
3562                if (fp->fInputIdx >= fActiveLimit) {
3563                    // At end of input.  Match failed.  Backtrack out.
3564                    fHitEnd = TRUE;
3565                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3566                    break;
3567                }
3568
3569                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3570
3571                // There is input left.  Advance over one char, except if we are
3572                //   at a cr/lf, advance over both of them.
3573                UChar32 c;
3574                c = UTEXT_NEXT32(fInputText);
3575                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3576                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3577                    // In the case of a CR/LF, we need to advance over both.
3578                    UChar32 nextc = UTEXT_CURRENT32(fInputText);
3579                    if (nextc == 0x0a) {
3580                        UTEXT_NEXT32(fInputText);
3581                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3582                    }
3583                }
3584            }
3585            break;
3586
3587
3588        case URX_DOTANY_UNIX:
3589            {
3590                // '.' operator, matches all, but stops at end-of-line.
3591                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3592                if (fp->fInputIdx >= fActiveLimit) {
3593                    // At end of input.  Match failed.  Backtrack out.
3594                    fHitEnd = TRUE;
3595                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3596                    break;
3597                }
3598
3599                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3600
3601                // There is input left.  Advance over one char, unless we've hit end-of-line
3602                UChar32 c = UTEXT_NEXT32(fInputText);
3603                if (c == 0x0a) {
3604                    // End of line in normal mode.   '.' does not match the \n
3605                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3606                } else {
3607                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3608                }
3609            }
3610            break;
3611
3612
3613        case URX_JMP:
3614            fp->fPatIdx = opValue;
3615            break;
3616
3617        case URX_FAIL:
3618            isMatch = FALSE;
3619            goto breakFromLoop;
3620
3621        case URX_JMP_SAV:
3622            U_ASSERT(opValue < fPattern->fCompiledPat->size());
3623            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3624            fp->fPatIdx = opValue;                         // Then JMP.
3625            break;
3626
3627        case URX_JMP_SAV_X:
3628            // This opcode is used with (x)+, when x can match a zero length string.
3629            // Same as JMP_SAV, except conditional on the match having made forward progress.
3630            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3631            //   data address of the input position at the start of the loop.
3632            {
3633                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3634                int32_t  stoOp = (int32_t)pat[opValue-1];
3635                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3636                int32_t  frameLoc = URX_VAL(stoOp);
3637                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3638                int64_t prevInputIdx = fp->fExtra[frameLoc];
3639                U_ASSERT(prevInputIdx <= fp->fInputIdx);
3640                if (prevInputIdx < fp->fInputIdx) {
3641                    // The match did make progress.  Repeat the loop.
3642                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3643                    fp->fPatIdx = opValue;
3644                    fp->fExtra[frameLoc] = fp->fInputIdx;
3645                }
3646                // If the input position did not advance, we do nothing here,
3647                //   execution will fall out of the loop.
3648            }
3649            break;
3650
3651        case URX_CTR_INIT:
3652            {
3653                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3654                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3655
3656                // Pick up the three extra operands that CTR_INIT has, and
3657                //    skip the pattern location counter past
3658                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3659                fp->fPatIdx += 3;
3660                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3661                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3662                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3663                U_ASSERT(minCount>=0);
3664                U_ASSERT(maxCount>=minCount || maxCount==-1);
3665                U_ASSERT(loopLoc>fp->fPatIdx);
3666
3667                if (minCount == 0) {
3668                    fp = StateSave(fp, loopLoc+1, status);
3669                }
3670                if (maxCount == 0) {
3671                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3672                }
3673            }
3674            break;
3675
3676        case URX_CTR_LOOP:
3677            {
3678                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3679                int32_t initOp = (int32_t)pat[opValue];
3680                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3681                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3682                int32_t minCount  = (int32_t)pat[opValue+2];
3683                int32_t maxCount  = (int32_t)pat[opValue+3];
3684                // Increment the counter.  Note: we DIDN'T worry about counter
3685                //   overflow, since the data comes from UnicodeStrings, which
3686                //   stores its length in an int32_t. Do we have to think about
3687                //   this now that we're using UText? Probably not, since the length
3688                //   in UChar32s is still an int32_t.
3689                (*pCounter)++;
3690                U_ASSERT(*pCounter > 0);
3691                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3692                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
3693                    break;
3694                }
3695                if (*pCounter >= minCount) {
3696                    fp = StateSave(fp, fp->fPatIdx, status);
3697                }
3698                fp->fPatIdx = opValue + 4;    // Loop back.
3699            }
3700            break;
3701
3702        case URX_CTR_INIT_NG:
3703            {
3704                // Initialize a non-greedy loop
3705                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3706                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
3707
3708                // Pick up the three extra operands that CTR_INIT has, and
3709                //    skip the pattern location counter past
3710                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3711                fp->fPatIdx += 3;
3712                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3713                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3714                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3715                U_ASSERT(minCount>=0);
3716                U_ASSERT(maxCount>=minCount || maxCount==-1);
3717                U_ASSERT(loopLoc>fp->fPatIdx);
3718
3719                if (minCount == 0) {
3720                    if (maxCount != 0) {
3721                        fp = StateSave(fp, fp->fPatIdx, status);
3722                    }
3723                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3724                }
3725            }
3726            break;
3727
3728        case URX_CTR_LOOP_NG:
3729            {
3730                // Non-greedy {min, max} loops
3731                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3732                int32_t initOp = (int32_t)pat[opValue];
3733                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3734                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3735                int32_t minCount  = (int32_t)pat[opValue+2];
3736                int32_t maxCount  = (int32_t)pat[opValue+3];
3737                // Increment the counter.  Note: we DIDN'T worry about counter
3738                //   overflow, since the data comes from UnicodeStrings, which
3739                //   stores its length in an int32_t. Do we have to think about
3740                //   this now that we're using UText? Probably not, since the length
3741                //   in UChar32s is still an int32_t.
3742                (*pCounter)++;
3743                U_ASSERT(*pCounter > 0);
3744
3745                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
3746                    // The loop has matched the maximum permitted number of times.
3747                    //   Break out of here with no action.  Matching will
3748                    //   continue with the following pattern.
3749                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
3750                    break;
3751                }
3752
3753                if (*pCounter < minCount) {
3754                    // We haven't met the minimum number of matches yet.
3755                    //   Loop back for another one.
3756                    fp->fPatIdx = opValue + 4;    // Loop back.
3757                } else {
3758                    // We do have the minimum number of matches.
3759                    //   Fall into the following pattern, but first do
3760                    //   a state save to the top of the loop, so that a failure
3761                    //   in the following pattern will try another iteration of the loop.
3762                    fp = StateSave(fp, opValue + 4, status);
3763                }
3764            }
3765            break;
3766
3767        case URX_STO_SP:
3768            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3769            fData[opValue] = fStack->size();
3770            break;
3771
3772        case URX_LD_SP:
3773            {
3774                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3775                int32_t newStackSize = (int32_t)fData[opValue];
3776                U_ASSERT(newStackSize <= fStack->size());
3777                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3778                if (newFP == (int64_t *)fp) {
3779                    break;
3780                }
3781                int32_t i;
3782                for (i=0; i<fFrameSize; i++) {
3783                    newFP[i] = ((int64_t *)fp)[i];
3784                }
3785                fp = (REStackFrame *)newFP;
3786                fStack->setSize(newStackSize);
3787            }
3788            break;
3789
3790        case URX_BACKREF:
3791        case URX_BACKREF_I:
3792            {
3793                U_ASSERT(opValue < fFrameSize);
3794                int64_t groupStartIdx = fp->fExtra[opValue];
3795                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3796                U_ASSERT(groupStartIdx <= groupEndIdx);
3797                if (groupStartIdx < 0) {
3798                    // This capture group has not participated in the match thus far,
3799                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3800                }
3801
3802                if (groupEndIdx == groupStartIdx) {
3803                    //   The capture group match was of an empty string.
3804                    //   Verified by testing:  Perl matches succeed in this case, so
3805                    //   we do too.
3806                    break;
3807                }
3808
3809                UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3810                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3811
3812                UBool haveMatch = (opType == URX_BACKREF ?
3813                    (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) :
3814                    (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status)));
3815                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3816
3817                if (fp->fInputIdx > fActiveLimit) {
3818                    fHitEnd = TRUE;
3819                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3820                } else if (!haveMatch) {
3821                    if (fp->fInputIdx == fActiveLimit) {
3822                        fHitEnd = TRUE;
3823                    }
3824                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3825                }
3826            }
3827            break;
3828
3829        case URX_STO_INP_LOC:
3830            {
3831                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3832                fp->fExtra[opValue] = fp->fInputIdx;
3833            }
3834            break;
3835
3836        case URX_JMPX:
3837            {
3838                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3839                fp->fPatIdx += 1;
3840                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3841                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3842                int64_t savedInputIdx = fp->fExtra[dataLoc];
3843                U_ASSERT(savedInputIdx <= fp->fInputIdx);
3844                if (savedInputIdx < fp->fInputIdx) {
3845                    fp->fPatIdx = opValue;                               // JMP
3846                } else {
3847                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3848                }
3849            }
3850            break;
3851
3852        case URX_LA_START:
3853            {
3854                // Entering a lookahead block.
3855                // Save Stack Ptr, Input Pos.
3856                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3857                fData[opValue]   = fStack->size();
3858                fData[opValue+1] = fp->fInputIdx;
3859                fActiveStart     = fLookStart;          // Set the match region change for
3860                fActiveLimit     = fLookLimit;          //   transparent bounds.
3861            }
3862            break;
3863
3864        case URX_LA_END:
3865            {
3866                // Leaving a look-ahead block.
3867                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3868                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3869                int32_t stackSize = fStack->size();
3870                int32_t newStackSize =(int32_t)fData[opValue];
3871                U_ASSERT(stackSize >= newStackSize);
3872                if (stackSize > newStackSize) {
3873                    // Copy the current top frame back to the new (cut back) top frame.
3874                    //   This makes the capture groups from within the look-ahead
3875                    //   expression available.
3876                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3877                    int32_t i;
3878                    for (i=0; i<fFrameSize; i++) {
3879                        newFP[i] = ((int64_t *)fp)[i];
3880                    }
3881                    fp = (REStackFrame *)newFP;
3882                    fStack->setSize(newStackSize);
3883                }
3884                fp->fInputIdx = fData[opValue+1];
3885
3886                // Restore the active region bounds in the input string; they may have
3887                //    been changed because of transparent bounds on a Region.
3888                fActiveStart = fRegionStart;
3889                fActiveLimit = fRegionLimit;
3890            }
3891            break;
3892
3893        case URX_ONECHAR_I:
3894            if (fp->fInputIdx < fActiveLimit) {
3895                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3896
3897                UChar32 c = UTEXT_NEXT32(fInputText);
3898                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3899                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3900                    break;
3901                }
3902            } else {
3903                fHitEnd = TRUE;
3904            }
3905
3906            #ifdef REGEX_SMART_BACKTRACKING
3907            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
3908                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
3909                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
3910                    UBool success = FALSE;
3911                    UChar32 c = UTEXT_PREVIOUS32(fInputText);
3912                    while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) {
3913                        if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3914                            success = TRUE;
3915                            break;
3916                        } else if (c == U_SENTINEL) {
3917                            break;
3918                        }
3919                        c = UTEXT_PREVIOUS32(fInputText);
3920                    }
3921                    if (success) {
3922                        fHitEnd = FALSE;
3923                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3924                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3925                        if (fp->fInputIdx > backSearchIndex) {
3926                            fp = StateSave(fp, fp->fPatIdx, status);
3927                        }
3928                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
3929                        break;
3930                    }
3931                }
3932            }
3933            #endif
3934
3935            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3936            break;
3937
3938        case URX_STRING_I:
3939            {
3940                // Test input against a literal string.
3941                // Strings require two slots in the compiled pattern, one for the
3942                //   offset to the string text, and one for the length.
3943                const UCaseProps *csp = ucase_getSingleton();
3944                {
3945                    int32_t stringStartIdx, stringLen;
3946                    stringStartIdx = opValue;
3947
3948                    op      = (int32_t)pat[fp->fPatIdx];
3949                    fp->fPatIdx++;
3950                    opType  = URX_TYPE(op);
3951                    opValue = URX_VAL(op);
3952                    U_ASSERT(opType == URX_STRING_LEN);
3953                    stringLen = opValue;
3954
3955                    const UChar *patternChars = litText+stringStartIdx;
3956                    const UChar *patternEnd = patternChars+stringLen;
3957
3958                    const UChar *foldChars = NULL;
3959                    int32_t foldOffset, foldLength;
3960                    UChar32 c;
3961
3962                    foldOffset = foldLength = 0;
3963                    UBool success = TRUE;
3964
3965                    UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3966                    while (patternChars < patternEnd && success) {
3967                        if(foldOffset < foldLength) {
3968                            U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3969                        } else {
3970                            c = UTEXT_NEXT32(fInputText);
3971                            if (c != U_SENTINEL) {
3972                                foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
3973                                if(foldLength >= 0) {
3974                                    if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
3975                                        foldOffset = 0;
3976                                        U16_NEXT_UNSAFE(foldChars, foldOffset, c);
3977                                    } else {
3978                                        c = foldLength;
3979                                        foldLength = foldOffset; // to avoid reading chars from the folding buffer
3980                                    }
3981                                }
3982                            }
3983
3984                            fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3985                        }
3986
3987                        success = FALSE;
3988                        if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) {
3989                            if (U_IS_BMP(c)) {
3990                                success = (*patternChars == c);
3991                                patternChars += 1;
3992                            } else if (patternChars+1 < patternEnd) {
3993                                success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
3994                                patternChars += 2;
3995                            }
3996                        } else {
3997                            fHitEnd = TRUE;          //   TODO:  See ticket 6074
3998                        }
3999                    }
4000
4001                    if (!success) {
4002                        #ifdef REGEX_SMART_BACKTRACKING
4003                        if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4004                            REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4005                            if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4006                                // Reset to last start point
4007                                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4008                                patternChars = litText+stringStartIdx;
4009
4010                                // Search backwards for a possible start
4011                                do {
4012                                    c = UTEXT_PREVIOUS32(fInputText);
4013                                    if (c == U_SENTINEL) {
4014                                        break;
4015                                    } else {
4016                                        foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
4017                                        if(foldLength >= 0) {
4018                                            if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
4019                                                foldOffset = 0;
4020                                                U16_NEXT_UNSAFE(foldChars, foldOffset, c);
4021                                            } else {
4022                                                c = foldLength;
4023                                                foldLength = foldOffset; // to avoid reading chars from the folding buffer
4024                                            }
4025                                        }
4026
4027                                        if ((U_IS_BMP(c) && *patternChars == c) ||
4028                                               (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
4029                                            success = TRUE;
4030                                            break;
4031                                        }
4032                                    }
4033                                } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex);
4034
4035                                // And try again
4036                                if (success) {
4037                                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4038                                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4039                                    if (fp->fInputIdx > backSearchIndex) {
4040                                        fp = StateSave(fp, fp->fPatIdx, status);
4041                                    }
4042                                    fp->fPatIdx++; // Skip the LOOP_C, we just did that
4043                                    break;
4044                                }
4045                            }
4046                        }
4047                        #endif
4048                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4049                    }
4050                }
4051            }
4052            break;
4053
4054        case URX_LB_START:
4055            {
4056                // Entering a look-behind block.
4057                // Save Stack Ptr, Input Pos.
4058                //   TODO:  implement transparent bounds.  Ticket #6067
4059                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4060                fData[opValue]   = fStack->size();
4061                fData[opValue+1] = fp->fInputIdx;
4062                // Init the variable containing the start index for attempted matches.
4063                fData[opValue+2] = -1;
4064                // Save input string length, then reset to pin any matches to end at
4065                //   the current position.
4066                fData[opValue+3] = fActiveLimit;
4067                fActiveLimit     = fp->fInputIdx;
4068            }
4069            break;
4070
4071
4072        case URX_LB_CONT:
4073            {
4074                // Positive Look-Behind, at top of loop checking for matches of LB expression
4075                //    at all possible input starting positions.
4076
4077                // Fetch the min and max possible match lengths.  They are the operands
4078                //   of this op in the pattern.
4079                int32_t minML = (int32_t)pat[fp->fPatIdx++];
4080                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4081                U_ASSERT(minML <= maxML);
4082                U_ASSERT(minML >= 0);
4083
4084                // Fetch (from data) the last input index where a match was attempted.
4085                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4086                int64_t  *lbStartIdx = &fData[opValue+2];
4087                if (*lbStartIdx < 0) {
4088                    // First time through loop.
4089                    *lbStartIdx = fp->fInputIdx - minML;
4090                } else {
4091                    // 2nd through nth time through the loop.
4092                    // Back up start position for match by one.
4093                    if (*lbStartIdx == 0) {
4094                        (*lbStartIdx)--;
4095                    } else {
4096                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4097                        UTEXT_PREVIOUS32(fInputText);
4098                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4099                    }
4100                }
4101
4102                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4103                    // We have tried all potential match starting points without
4104                    //  getting a match.  Backtrack out, and out of the
4105                    //   Look Behind altogether.
4106                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4107                    int64_t restoreInputLen = fData[opValue+3];
4108                    U_ASSERT(restoreInputLen >= fActiveLimit);
4109                    U_ASSERT(restoreInputLen <= fInputLength);
4110                    fActiveLimit = restoreInputLen;
4111                    break;
4112                }
4113
4114                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4115                //      (successful match will fall off the end of the loop.)
4116                fp = StateSave(fp, fp->fPatIdx-3, status);
4117                fp->fInputIdx = *lbStartIdx;
4118            }
4119            break;
4120
4121        case URX_LB_END:
4122            // End of a look-behind block, after a successful match.
4123            {
4124                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4125                if (fp->fInputIdx != fActiveLimit) {
4126                    //  The look-behind expression matched, but the match did not
4127                    //    extend all the way to the point that we are looking behind from.
4128                    //  FAIL out of here, which will take us back to the LB_CONT, which
4129                    //     will retry the match starting at another position or fail
4130                    //     the look-behind altogether, whichever is appropriate.
4131                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4132                    break;
4133                }
4134
4135                // Look-behind match is good.  Restore the orignal input string length,
4136                //   which had been truncated to pin the end of the lookbehind match to the
4137                //   position being looked-behind.
4138                int64_t originalInputLen = fData[opValue+3];
4139                U_ASSERT(originalInputLen >= fActiveLimit);
4140                U_ASSERT(originalInputLen <= fInputLength);
4141                fActiveLimit = originalInputLen;
4142            }
4143            break;
4144
4145
4146        case URX_LBN_CONT:
4147            {
4148                // Negative Look-Behind, at top of loop checking for matches of LB expression
4149                //    at all possible input starting positions.
4150
4151                // Fetch the extra parameters of this op.
4152                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
4153                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
4154                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4155                        continueLoc = URX_VAL(continueLoc);
4156                U_ASSERT(minML <= maxML);
4157                U_ASSERT(minML >= 0);
4158                U_ASSERT(continueLoc > fp->fPatIdx);
4159
4160                // Fetch (from data) the last input index where a match was attempted.
4161                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4162                int64_t  *lbStartIdx = &fData[opValue+2];
4163                if (*lbStartIdx < 0) {
4164                    // First time through loop.
4165                    *lbStartIdx = fp->fInputIdx - minML;
4166                } else {
4167                    // 2nd through nth time through the loop.
4168                    // Back up start position for match by one.
4169                    if (*lbStartIdx == 0) {
4170                        (*lbStartIdx)--;
4171                    } else {
4172                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4173                        UTEXT_PREVIOUS32(fInputText);
4174                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4175                    }
4176                }
4177
4178                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4179                    // We have tried all potential match starting points without
4180                    //  getting a match, which means that the negative lookbehind as
4181                    //  a whole has succeeded.  Jump forward to the continue location
4182                    int64_t restoreInputLen = fData[opValue+3];
4183                    U_ASSERT(restoreInputLen >= fActiveLimit);
4184                    U_ASSERT(restoreInputLen <= fInputLength);
4185                    fActiveLimit = restoreInputLen;
4186                    fp->fPatIdx = continueLoc;
4187                    break;
4188                }
4189
4190                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4191                //      (successful match will cause a FAIL out of the loop altogether.)
4192                fp = StateSave(fp, fp->fPatIdx-4, status);
4193                fp->fInputIdx = *lbStartIdx;
4194            }
4195            break;
4196
4197        case URX_LBN_END:
4198            // End of a negative look-behind block, after a successful match.
4199            {
4200                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4201                if (fp->fInputIdx != fActiveLimit) {
4202                    //  The look-behind expression matched, but the match did not
4203                    //    extend all the way to the point that we are looking behind from.
4204                    //  FAIL out of here, which will take us back to the LB_CONT, which
4205                    //     will retry the match starting at another position or succeed
4206                    //     the look-behind altogether, whichever is appropriate.
4207                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4208                    break;
4209                }
4210
4211                // Look-behind expression matched, which means look-behind test as
4212                //   a whole Fails
4213
4214                //   Restore the orignal input string length, which had been truncated
4215                //   inorder to pin the end of the lookbehind match
4216                //   to the position being looked-behind.
4217                int64_t originalInputLen = fData[opValue+3];
4218                U_ASSERT(originalInputLen >= fActiveLimit);
4219                U_ASSERT(originalInputLen <= fInputLength);
4220                fActiveLimit = originalInputLen;
4221
4222                // Restore original stack position, discarding any state saved
4223                //   by the successful pattern match.
4224                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4225                int32_t newStackSize = (int32_t)fData[opValue];
4226                U_ASSERT(fStack->size() > newStackSize);
4227                fStack->setSize(newStackSize);
4228
4229                //  FAIL, which will take control back to someplace
4230                //  prior to entering the look-behind test.
4231                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4232            }
4233            break;
4234
4235
4236        case URX_LOOP_SR_I:
4237            // Loop Initialization for the optimized implementation of
4238            //     [some character set]*
4239            //   This op scans through all matching input.
4240            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4241            {
4242                U_ASSERT(opValue > 0 && opValue < sets->size());
4243                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4244                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4245
4246                // Loop through input, until either the input is exhausted or
4247                //   we reach a character that is not a member of the set.
4248                int64_t ix = fp->fInputIdx;
4249                UTEXT_SETNATIVEINDEX(fInputText, ix);
4250                for (;;) {
4251                    if (ix >= fActiveLimit) {
4252                        fHitEnd = TRUE;
4253                        break;
4254                    }
4255                    UChar32 c = UTEXT_NEXT32(fInputText);
4256                    if (c<256) {
4257                        if (s8->contains(c) == FALSE) {
4258                            break;
4259                        }
4260                    } else {
4261                        if (s->contains(c) == FALSE) {
4262                            break;
4263                        }
4264                    }
4265                    ix = UTEXT_GETNATIVEINDEX(fInputText);
4266                }
4267
4268                // If there were no matching characters, skip over the loop altogether.
4269                //   The loop doesn't run at all, a * op always succeeds.
4270                if (ix == fp->fInputIdx) {
4271                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4272                    break;
4273                }
4274
4275                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4276                //   must follow.  It's operand is the stack location
4277                //   that holds the starting input index for the match of this [set]*
4278                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4279                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4280                int32_t stackLoc = URX_VAL(loopcOp);
4281                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4282                fp->fExtra[stackLoc] = fp->fInputIdx;
4283                #ifdef REGEX_SMART_BACKTRACKING
4284                backSearchIndex = fp->fInputIdx;
4285                #endif
4286                fp->fInputIdx = ix;
4287
4288                // Save State to the URX_LOOP_C op that follows this one,
4289                //   so that match failures in the following code will return to there.
4290                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4291                fp = StateSave(fp, fp->fPatIdx, status);
4292                fp->fPatIdx++;
4293            }
4294            break;
4295
4296
4297        case URX_LOOP_DOT_I:
4298            // Loop Initialization for the optimized implementation of .*
4299            //   This op scans through all remaining input.
4300            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4301            {
4302                // Loop through input until the input is exhausted (we reach an end-of-line)
4303                // In DOTALL mode, we can just go straight to the end of the input.
4304                int64_t ix;
4305                if ((opValue & 1) == 1) {
4306                    // Dot-matches-All mode.  Jump straight to the end of the string.
4307                    ix = fActiveLimit;
4308                    fHitEnd = TRUE;
4309                } else {
4310                    // NOT DOT ALL mode.  Line endings do not match '.'
4311                    // Scan forward until a line ending or end of input.
4312                    ix = fp->fInputIdx;
4313                    UTEXT_SETNATIVEINDEX(fInputText, ix);
4314                    for (;;) {
4315                        if (ix >= fActiveLimit) {
4316                            fHitEnd = TRUE;
4317                            break;
4318                        }
4319                        UChar32 c = UTEXT_NEXT32(fInputText);
4320                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4321                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4322                               (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4323                                    (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4324                                //  char is a line ending.  Exit the scanning loop.
4325                                break;
4326                            }
4327                        }
4328                        ix = UTEXT_GETNATIVEINDEX(fInputText);
4329                    }
4330                }
4331
4332                // If there were no matching characters, skip over the loop altogether.
4333                //   The loop doesn't run at all, a * op always succeeds.
4334                if (ix == fp->fInputIdx) {
4335                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4336                    break;
4337                }
4338
4339                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4340                //   must follow.  It's operand is the stack location
4341                //   that holds the starting input index for the match of this .*
4342                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4343                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4344                int32_t stackLoc = URX_VAL(loopcOp);
4345                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4346                fp->fExtra[stackLoc] = fp->fInputIdx;
4347                #ifdef REGEX_SMART_BACKTRACKING
4348                backSearchIndex = fp->fInputIdx;
4349                #endif
4350                fp->fInputIdx = ix;
4351
4352                // Save State to the URX_LOOP_C op that follows this one,
4353                //   so that match failures in the following code will return to there.
4354                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4355                fp = StateSave(fp, fp->fPatIdx, status);
4356                fp->fPatIdx++;
4357            }
4358            break;
4359
4360
4361        case URX_LOOP_C:
4362            {
4363                U_ASSERT(opValue>=0 && opValue<fFrameSize);
4364                backSearchIndex = fp->fExtra[opValue];
4365                U_ASSERT(backSearchIndex <= fp->fInputIdx);
4366                if (backSearchIndex == fp->fInputIdx) {
4367                    // We've backed up the input idx to the point that the loop started.
4368                    // The loop is done.  Leave here without saving state.
4369                    //  Subsequent failures won't come back here.
4370                    break;
4371                }
4372                // Set up for the next iteration of the loop, with input index
4373                //   backed up by one from the last time through,
4374                //   and a state save to this instruction in case the following code fails again.
4375                //   (We're going backwards because this loop emulates stack unwinding, not
4376                //    the initial scan forward.)
4377                U_ASSERT(fp->fInputIdx > 0);
4378                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4379                UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4380                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4381
4382                UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4383                if (prevC == 0x0a &&
4384                    fp->fInputIdx > backSearchIndex &&
4385                    twoPrevC == 0x0d) {
4386                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4387                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4388                        // .*, stepping back over CRLF pair.
4389                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4390                    }
4391                }
4392
4393
4394                fp = StateSave(fp, fp->fPatIdx-1, status);
4395            }
4396            break;
4397
4398
4399
4400        default:
4401            // Trouble.  The compiled pattern contains an entry with an
4402            //           unrecognized type tag.
4403            U_ASSERT(FALSE);
4404        }
4405
4406        if (U_FAILURE(status)) {
4407            isMatch = FALSE;
4408            break;
4409        }
4410    }
4411
4412breakFromLoop:
4413    fMatch = isMatch;
4414    if (isMatch) {
4415        fLastMatchEnd = fMatchEnd;
4416        fMatchStart   = startIdx;
4417        fMatchEnd     = fp->fInputIdx;
4418        if (fTraceDebug) {
4419            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
4420        }
4421    }
4422    else
4423    {
4424        if (fTraceDebug) {
4425            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
4426        }
4427    }
4428
4429    fFrame = fp;                // The active stack frame when the engine stopped.
4430                                //   Contains the capture group results that we need to
4431                                //    access later.
4432    return;
4433}
4434
4435
4436//--------------------------------------------------------------------------------
4437//
4438//   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4439//                  assumption that the entire string is available in the UText's
4440//                  chunk buffer. For now, that means we can use int32_t indexes,
4441//                  except for anything that needs to be saved (like group starts
4442//                  and ends).
4443//
4444//                  startIdx:    begin matching a this index.
4445//                  toEnd:       if true, match must extend to end of the input region
4446//
4447//--------------------------------------------------------------------------------
4448void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4449    UBool       isMatch  = FALSE;      // True if the we have a match.
4450
4451    int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4452
4453    int32_t     op;                    // Operation from the compiled pattern, split into
4454    int32_t     opType;                //    the opcode
4455    int32_t     opValue;               //    and the operand value.
4456
4457#ifdef REGEX_RUN_DEBUG
4458    if (fTraceDebug)
4459    {
4460        printf("MatchAt(startIdx=%ld)\n", startIdx);
4461        printf("Original Pattern: ");
4462        UChar32 c = utext_next32From(fPattern->fPattern, 0);
4463        while (c != U_SENTINEL) {
4464            if (c<32 || c>256) {
4465                c = '.';
4466            }
4467            REGEX_DUMP_DEBUG_PRINTF(("%c", c));
4468
4469            c = UTEXT_NEXT32(fPattern->fPattern);
4470        }
4471        printf("\n");
4472        printf("Input String: ");
4473        c = utext_next32From(fInputText, 0);
4474        while (c != U_SENTINEL) {
4475            if (c<32 || c>256) {
4476                c = '.';
4477            }
4478            printf("%c", c);
4479
4480            c = UTEXT_NEXT32(fInputText);
4481        }
4482        printf("\n");
4483        printf("\n");
4484    }
4485#endif
4486
4487    if (U_FAILURE(status)) {
4488        return;
4489    }
4490
4491    //  Cache frequently referenced items from the compiled pattern
4492    //
4493    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4494
4495    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4496    UVector             *sets          = fPattern->fSets;
4497
4498    const UChar         *inputBuf      = fInputText->chunkContents;
4499
4500    fFrameSize = fPattern->fFrameSize;
4501    REStackFrame        *fp            = resetStack();
4502
4503    fp->fPatIdx   = 0;
4504    fp->fInputIdx = startIdx;
4505
4506    // Zero out the pattern's static data
4507    int32_t i;
4508    for (i = 0; i<fPattern->fDataSize; i++) {
4509        fData[i] = 0;
4510    }
4511
4512    //
4513    //  Main loop for interpreting the compiled pattern.
4514    //  One iteration of the loop per pattern operation performed.
4515    //
4516    for (;;) {
4517#if 0
4518        if (_heapchk() != _HEAPOK) {
4519            fprintf(stderr, "Heap Trouble\n");
4520        }
4521#endif
4522
4523        op      = (int32_t)pat[fp->fPatIdx];
4524        opType  = URX_TYPE(op);
4525        opValue = URX_VAL(op);
4526#ifdef REGEX_RUN_DEBUG
4527        if (fTraceDebug) {
4528            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4529            printf("inputIdx=%d   inputChar=%x   sp=%3d   activeLimit=%d  ", fp->fInputIdx,
4530                   UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4531            fPattern->dumpOp(fp->fPatIdx);
4532        }
4533#endif
4534        fp->fPatIdx++;
4535
4536        switch (opType) {
4537
4538
4539        case URX_NOP:
4540            break;
4541
4542
4543        case URX_BACKTRACK:
4544            // Force a backtrack.  In some circumstances, the pattern compiler
4545            //   will notice that the pattern can't possibly match anything, and will
4546            //   emit one of these at that point.
4547            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4548            break;
4549
4550
4551        case URX_ONECHAR:
4552            if (fp->fInputIdx < fActiveLimit) {
4553                UChar32 c;
4554                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4555                if (c == opValue) {
4556                    break;
4557                }
4558            } else {
4559                fHitEnd = TRUE;
4560            }
4561
4562            #ifdef REGEX_SMART_BACKTRACKING
4563            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
4564                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4565                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4566                    int64_t reverseIndex = fp->fInputIdx;
4567                    UChar32 c;
4568                    do {
4569                        U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4570                        if (c == opValue) {
4571                            break;
4572                        }
4573                    } while (reverseIndex > backSearchIndex);
4574                    if (c == opValue) {
4575                        fHitEnd = FALSE;
4576                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4577                        fp->fInputIdx = reverseIndex;
4578                        if (fp->fInputIdx > backSearchIndex) {
4579                            fp = StateSave(fp, fp->fPatIdx, status);
4580                        }
4581                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
4582                        break;
4583                    }
4584                }
4585            }
4586            #endif
4587
4588            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4589            break;
4590
4591
4592        case URX_STRING:
4593            {
4594                // Test input against a literal string.
4595                // Strings require two slots in the compiled pattern, one for the
4596                //   offset to the string text, and one for the length.
4597                int32_t   stringStartIdx = opValue;
4598                int32_t   stringLen;
4599
4600                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4601                fp->fPatIdx++;
4602                opType    = URX_TYPE(op);
4603                stringLen = URX_VAL(op);
4604                U_ASSERT(opType == URX_STRING_LEN);
4605                U_ASSERT(stringLen >= 2);
4606
4607                if (fp->fInputIdx + stringLen > fActiveLimit) {
4608                    // No match.  String is longer than the remaining input text.
4609                    fHitEnd = TRUE;          //   TODO:  See ticket 6074
4610                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4611                    break;
4612                }
4613
4614                const UChar * pInp = inputBuf + fp->fInputIdx;
4615                const UChar * pPat = litText+stringStartIdx;
4616                const UChar * pEnd = pInp + stringLen;
4617                UBool success = FALSE;
4618                for(;;) {
4619                    if (*pInp == *pPat) {
4620                        pInp++;
4621                        pPat++;
4622                        if (pInp == pEnd) {
4623                            // Successful Match.
4624                            success = TRUE;
4625                            break;
4626                        }
4627                    } else {
4628                        // Match failed.
4629                        break;
4630                    }
4631                }
4632
4633                if (success) {
4634                    fp->fInputIdx += stringLen;
4635                } else {
4636                    #ifdef REGEX_SMART_BACKTRACKING
4637                    if (fp->fInputIdx > backSearchIndex && fStack->size()) {
4638                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
4639                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
4640                            // Reset to last start point
4641                            int64_t reverseIndex = fp->fInputIdx;
4642                            UChar32 c;
4643                            pPat = litText+stringStartIdx;
4644
4645                            // Search backwards for a possible start
4646                            do {
4647                                U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
4648                                if ((U_IS_BMP(c) && *pPat == c) ||
4649                                    (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) {
4650                                    success = TRUE;
4651                                    break;
4652                                }
4653                            } while (reverseIndex > backSearchIndex);
4654
4655                            // And try again
4656                            if (success) {
4657                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4658                                fp->fInputIdx = reverseIndex;
4659                                if (fp->fInputIdx > backSearchIndex) {
4660                                    fp = StateSave(fp, fp->fPatIdx, status);
4661                                }
4662                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
4663                                break;
4664                            }
4665                        }
4666                    }
4667                    #endif
4668                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4669                }
4670            }
4671            break;
4672
4673
4674        case URX_STATE_SAVE:
4675            fp = StateSave(fp, opValue, status);
4676            break;
4677
4678
4679        case URX_END:
4680            // The match loop will exit via this path on a successful match,
4681            //   when we reach the end of the pattern.
4682            if (toEnd && fp->fInputIdx != fActiveLimit) {
4683                // The pattern matched, but not to the end of input.  Try some more.
4684                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4685                break;
4686            }
4687            isMatch = TRUE;
4688            goto  breakFromLoop;
4689
4690            // Start and End Capture stack frame variables are laid out out like this:
4691            //  fp->fExtra[opValue]  - The start of a completed capture group
4692            //             opValue+1 - The end   of a completed capture group
4693            //             opValue+2 - the start of a capture group whose end
4694            //                          has not yet been reached (and might not ever be).
4695        case URX_START_CAPTURE:
4696            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4697            fp->fExtra[opValue+2] = fp->fInputIdx;
4698            break;
4699
4700
4701        case URX_END_CAPTURE:
4702            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4703            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4704            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4705            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4706            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4707            break;
4708
4709
4710        case URX_DOLLAR:                   //  $, test for End of line
4711            //     or for position before new line at end of input
4712            if (fp->fInputIdx < fAnchorLimit-2) {
4713                // We are no where near the end of input.  Fail.
4714                //   This is the common case.  Keep it first.
4715                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4716                break;
4717            }
4718            if (fp->fInputIdx >= fAnchorLimit) {
4719                // We really are at the end of input.  Success.
4720                fHitEnd = TRUE;
4721                fRequireEnd = TRUE;
4722                break;
4723            }
4724
4725            // If we are positioned just before a new-line that is located at the
4726            //   end of input, succeed.
4727            if (fp->fInputIdx == fAnchorLimit-1) {
4728                UChar32 c;
4729                U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4730
4731                if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4732                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4733                        // At new-line at end of input. Success
4734                        fHitEnd = TRUE;
4735                        fRequireEnd = TRUE;
4736                        break;
4737                    }
4738                }
4739            } else if (fp->fInputIdx == fAnchorLimit-2 &&
4740                inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4741                    fHitEnd = TRUE;
4742                    fRequireEnd = TRUE;
4743                    break;                         // At CR/LF at end of input.  Success
4744            }
4745
4746            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4747
4748            break;
4749
4750
4751        case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4752            if (fp->fInputIdx >= fAnchorLimit-1) {
4753                // Either at the last character of input, or off the end.
4754                if (fp->fInputIdx == fAnchorLimit-1) {
4755                    // At last char of input.  Success if it's a new line.
4756                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4757                        fHitEnd = TRUE;
4758                        fRequireEnd = TRUE;
4759                        break;
4760                    }
4761                } else {
4762                    // Off the end of input.  Success.
4763                    fHitEnd = TRUE;
4764                    fRequireEnd = TRUE;
4765                    break;
4766                }
4767            }
4768
4769            // Not at end of input.  Back-track out.
4770            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4771            break;
4772
4773
4774        case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4775            {
4776                if (fp->fInputIdx >= fAnchorLimit) {
4777                    // We really are at the end of input.  Success.
4778                    fHitEnd = TRUE;
4779                    fRequireEnd = TRUE;
4780                    break;
4781                }
4782                // If we are positioned just before a new-line, succeed.
4783                // It makes no difference where the new-line is within the input.
4784                UChar32 c = inputBuf[fp->fInputIdx];
4785                if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4786                    // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4787                    //  In multi-line mode, hitting a new-line just before the end of input does not
4788                    //   set the hitEnd or requireEnd flags
4789                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4790                        break;
4791                    }
4792                }
4793                // not at a new line.  Fail.
4794                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4795            }
4796            break;
4797
4798
4799        case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4800            {
4801                if (fp->fInputIdx >= fAnchorLimit) {
4802                    // We really are at the end of input.  Success.
4803                    fHitEnd = TRUE;
4804                    fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4805                    break;               //   adding a new-line would not lose the match.
4806                }
4807                // If we are not positioned just before a new-line, the test fails; backtrack out.
4808                // It makes no difference where the new-line is within the input.
4809                if (inputBuf[fp->fInputIdx] != 0x0a) {
4810                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4811                }
4812            }
4813            break;
4814
4815
4816        case URX_CARET:                    //  ^, test for start of line
4817            if (fp->fInputIdx != fAnchorStart) {
4818                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4819            }
4820            break;
4821
4822
4823        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4824            {
4825                if (fp->fInputIdx == fAnchorStart) {
4826                    // We are at the start input.  Success.
4827                    break;
4828                }
4829                // Check whether character just before the current pos is a new-line
4830                //   unless we are at the end of input
4831                UChar  c = inputBuf[fp->fInputIdx - 1];
4832                if ((fp->fInputIdx < fAnchorLimit) &&
4833                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4834                    //  It's a new-line.  ^ is true.  Success.
4835                    //  TODO:  what should be done with positions between a CR and LF?
4836                    break;
4837                }
4838                // Not at the start of a line.  Fail.
4839                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4840            }
4841            break;
4842
4843
4844        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4845            {
4846                U_ASSERT(fp->fInputIdx >= fAnchorStart);
4847                if (fp->fInputIdx <= fAnchorStart) {
4848                    // We are at the start input.  Success.
4849                    break;
4850                }
4851                // Check whether character just before the current pos is a new-line
4852                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4853                UChar  c = inputBuf[fp->fInputIdx - 1];
4854                if (c != 0x0a) {
4855                    // Not at the start of a line.  Back-track out.
4856                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4857                }
4858            }
4859            break;
4860
4861        case URX_BACKSLASH_B:          // Test for word boundaries
4862            {
4863                UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4864                success ^= (opValue != 0);     // flip sense for \B
4865                if (!success) {
4866                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4867                }
4868            }
4869            break;
4870
4871
4872        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4873            {
4874                UBool success = isUWordBoundary(fp->fInputIdx);
4875                success ^= (opValue != 0);     // flip sense for \B
4876                if (!success) {
4877                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4878                }
4879            }
4880            break;
4881
4882
4883        case URX_BACKSLASH_D:            // Test for decimal digit
4884            {
4885                if (fp->fInputIdx >= fActiveLimit) {
4886                    fHitEnd = TRUE;
4887                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4888                    break;
4889                }
4890
4891                UChar32 c;
4892                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4893                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4894                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4895                success ^= (opValue != 0);        // flip sense for \D
4896                if (!success) {
4897                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4898                }
4899            }
4900            break;
4901
4902
4903        case URX_BACKSLASH_G:          // Test for position at end of previous match
4904            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4905                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4906            }
4907            break;
4908
4909
4910        case URX_BACKSLASH_X:
4911        //  Match a Grapheme, as defined by Unicode TR 29.
4912        //  Differs slightly from Perl, which consumes combining marks independently
4913        //    of context.
4914        {
4915
4916            // Fail if at end of input
4917            if (fp->fInputIdx >= fActiveLimit) {
4918                fHitEnd = TRUE;
4919                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4920                break;
4921            }
4922
4923            // Examine (and consume) the current char.
4924            //   Dispatch into a little state machine, based on the char.
4925            UChar32  c;
4926            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4927            UnicodeSet **sets = fPattern->fStaticSets;
4928            if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4929            if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4930            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4931            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4932            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4933            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4934            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4935            goto GC_Extend;
4936
4937
4938
4939GC_L:
4940            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4941            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4942            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4943            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4944            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4945            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4946            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4947            goto GC_Extend;
4948
4949GC_V:
4950            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4951            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4952            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4953            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4954            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4955            goto GC_Extend;
4956
4957GC_T:
4958            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4959            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4960            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4961            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4962            goto GC_Extend;
4963
4964GC_Extend:
4965            // Combining characters are consumed here
4966            for (;;) {
4967                if (fp->fInputIdx >= fActiveLimit) {
4968                    break;
4969                }
4970                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4971                if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4972                    U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4973                    break;
4974                }
4975            }
4976            goto GC_Done;
4977
4978GC_Control:
4979            // Most control chars stand alone (don't combine with combining chars),
4980            //   except for that CR/LF sequence is a single grapheme cluster.
4981            if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4982                fp->fInputIdx++;
4983            }
4984
4985GC_Done:
4986            if (fp->fInputIdx >= fActiveLimit) {
4987                fHitEnd = TRUE;
4988            }
4989            break;
4990        }
4991
4992
4993
4994
4995        case URX_BACKSLASH_Z:          // Test for end of Input
4996            if (fp->fInputIdx < fAnchorLimit) {
4997                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4998            } else {
4999                fHitEnd = TRUE;
5000                fRequireEnd = TRUE;
5001            }
5002            break;
5003
5004
5005
5006        case URX_STATIC_SETREF:
5007            {
5008                // Test input character against one of the predefined sets
5009                //    (Word Characters, for example)
5010                // The high bit of the op value is a flag for the match polarity.
5011                //    0:   success if input char is in set.
5012                //    1:   success if input char is not in set.
5013                if (fp->fInputIdx >= fActiveLimit) {
5014                    fHitEnd = TRUE;
5015                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5016                    break;
5017                }
5018
5019                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
5020                opValue &= ~URX_NEG_SET;
5021                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5022
5023                UChar32 c;
5024                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5025                if (c < 256) {
5026                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5027                    if (s8->contains(c)) {
5028                        success = !success;
5029                    }
5030                } else {
5031                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
5032                    if (s->contains(c)) {
5033                        success = !success;
5034                    }
5035                }
5036                if (!success) {
5037                    #ifdef REGEX_SMART_BACKTRACKING
5038                    if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5039                        REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5040                        if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5041                            // Try to find it, backwards
5042                            int64_t reverseIndex = fp->fInputIdx;
5043                            U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5044                            success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset
5045                            do {
5046                                U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5047                                if (c < 256) {
5048                                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5049                                    if (s8->contains(c)) {
5050                                        success = !success;
5051                                    }
5052                                } else {
5053                                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
5054                                    if (s->contains(c)) {
5055                                        success = !success;
5056                                    }
5057                                }
5058                            } while (reverseIndex > backSearchIndex && !success);
5059
5060                            if (success) {
5061                                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5062                                fp->fInputIdx = reverseIndex;
5063                                if (fp->fInputIdx > backSearchIndex) {
5064                                    fp = StateSave(fp, fp->fPatIdx, status);
5065                                }
5066                                fp->fPatIdx++; // Skip the LOOP_C, we just did that
5067                                break;
5068                            }
5069                        }
5070                    }
5071                    #endif
5072                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5073                }
5074            }
5075            break;
5076
5077
5078        case URX_STAT_SETREF_N:
5079            {
5080                // Test input character for NOT being a member of  one of
5081                //    the predefined sets (Word Characters, for example)
5082                if (fp->fInputIdx >= fActiveLimit) {
5083                    fHitEnd = TRUE;
5084                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5085                    break;
5086                }
5087
5088                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
5089
5090                UChar32  c;
5091                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5092                if (c < 256) {
5093                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5094                    if (s8->contains(c) == FALSE) {
5095                        break;
5096                    }
5097                } else {
5098                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
5099                    if (s->contains(c) == FALSE) {
5100                        break;
5101                    }
5102                }
5103
5104                #ifdef REGEX_SMART_BACKTRACKING
5105                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5106                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5107                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5108                        // Try to find it, backwards
5109                        int64_t reverseIndex = fp->fInputIdx;
5110                        U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5111                        UBool success = FALSE;
5112                        do {
5113                            U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5114                            if (c < 256) {
5115                                Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
5116                                if (s8->contains(c) == FALSE) {
5117                                    success = TRUE;
5118                                    break;
5119                                }
5120                            } else {
5121                                const UnicodeSet *s = fPattern->fStaticSets[opValue];
5122                                if (s->contains(c) == FALSE) {
5123                                    success = TRUE;
5124                                    break;
5125                                }
5126                            }
5127                        } while (reverseIndex > backSearchIndex);
5128
5129                        if (success) {
5130                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5131                            fp->fInputIdx = reverseIndex;
5132                            if (fp->fInputIdx > backSearchIndex) {
5133                                fp = StateSave(fp, fp->fPatIdx, status);
5134                            }
5135                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
5136                            break;
5137                        }
5138                    }
5139                }
5140                #endif
5141                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5142            }
5143            break;
5144
5145
5146        case URX_SETREF:
5147            {
5148                if (fp->fInputIdx >= fActiveLimit) {
5149                    fHitEnd = TRUE;
5150                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5151                    break;
5152                }
5153
5154                U_ASSERT(opValue > 0 && opValue < sets->size());
5155
5156                // There is input left.  Pick up one char and test it for set membership.
5157                UChar32  c;
5158                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5159                if (c<256) {
5160                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5161                    if (s8->contains(c)) {
5162                        // The character is in the set.  A Match.
5163                        break;
5164                    }
5165                } else {
5166                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5167                    if (s->contains(c)) {
5168                        // The character is in the set.  A Match.
5169                        break;
5170                    }
5171                }
5172
5173                // the character wasn't in the set.
5174                #ifdef REGEX_SMART_BACKTRACKING
5175                if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5176                    REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5177                    if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5178                        // Try to find it, backwards
5179                        int64_t reverseIndex = fp->fInputIdx;
5180                        U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried
5181                        UBool success = FALSE;
5182                        do {
5183                            U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5184                            if (c < 256) {
5185                                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5186                                if (s8->contains(c)) {
5187                                    success = TRUE;
5188                                    break;
5189                                }
5190                            } else {
5191                                UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5192                                if (s->contains(c)) {
5193                                    success = TRUE;
5194                                    break;
5195                                }
5196                            }
5197                        } while (reverseIndex > backSearchIndex);
5198
5199                        if (success) {
5200                            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5201                            fp->fInputIdx = reverseIndex;
5202                            if (fp->fInputIdx > reverseIndex) {
5203                                fp = StateSave(fp, fp->fPatIdx, status);
5204                            }
5205                            fp->fPatIdx++; // Skip the LOOP_C, we just did that
5206                            break;
5207                        }
5208                    }
5209                }
5210                #endif
5211                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5212            }
5213            break;
5214
5215
5216        case URX_DOTANY:
5217            {
5218                // . matches anything, but stops at end-of-line.
5219                if (fp->fInputIdx >= fActiveLimit) {
5220                    // At end of input.  Match failed.  Backtrack out.
5221                    fHitEnd = TRUE;
5222                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5223                    break;
5224                }
5225
5226                // There is input left.  Advance over one char, unless we've hit end-of-line
5227                UChar32  c;
5228                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5229                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
5230                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
5231                    // End of line in normal mode.   . does not match.
5232                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5233                    break;
5234                }
5235            }
5236            break;
5237
5238
5239        case URX_DOTANY_ALL:
5240            {
5241                // . in dot-matches-all (including new lines) mode
5242                if (fp->fInputIdx >= fActiveLimit) {
5243                    // At end of input.  Match failed.  Backtrack out.
5244                    fHitEnd = TRUE;
5245                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5246                    break;
5247                }
5248
5249                // There is input left.  Advance over one char, except if we are
5250                //   at a cr/lf, advance over both of them.
5251                UChar32 c;
5252                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5253                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
5254                    // In the case of a CR/LF, we need to advance over both.
5255                    if (inputBuf[fp->fInputIdx] == 0x0a) {
5256                        U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
5257                    }
5258                }
5259            }
5260            break;
5261
5262
5263        case URX_DOTANY_UNIX:
5264            {
5265                // '.' operator, matches all, but stops at end-of-line.
5266                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
5267                if (fp->fInputIdx >= fActiveLimit) {
5268                    // At end of input.  Match failed.  Backtrack out.
5269                    fHitEnd = TRUE;
5270                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5271                    break;
5272                }
5273
5274                // There is input left.  Advance over one char, unless we've hit end-of-line
5275                UChar32 c;
5276                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5277                if (c == 0x0a) {
5278                    // End of line in normal mode.   '.' does not match the \n
5279                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5280                }
5281            }
5282            break;
5283
5284
5285        case URX_JMP:
5286            fp->fPatIdx = opValue;
5287            break;
5288
5289        case URX_FAIL:
5290            isMatch = FALSE;
5291            goto breakFromLoop;
5292
5293        case URX_JMP_SAV:
5294            U_ASSERT(opValue < fPattern->fCompiledPat->size());
5295            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
5296            fp->fPatIdx = opValue;                         // Then JMP.
5297            break;
5298
5299        case URX_JMP_SAV_X:
5300            // This opcode is used with (x)+, when x can match a zero length string.
5301            // Same as JMP_SAV, except conditional on the match having made forward progress.
5302            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5303            //   data address of the input position at the start of the loop.
5304            {
5305                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5306                int32_t  stoOp = (int32_t)pat[opValue-1];
5307                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5308                int32_t  frameLoc = URX_VAL(stoOp);
5309                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5310                int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5311                U_ASSERT(prevInputIdx <= fp->fInputIdx);
5312                if (prevInputIdx < fp->fInputIdx) {
5313                    // The match did make progress.  Repeat the loop.
5314                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
5315                    fp->fPatIdx = opValue;
5316                    fp->fExtra[frameLoc] = fp->fInputIdx;
5317                }
5318                // If the input position did not advance, we do nothing here,
5319                //   execution will fall out of the loop.
5320            }
5321            break;
5322
5323        case URX_CTR_INIT:
5324            {
5325                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5326                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5327
5328                // Pick up the three extra operands that CTR_INIT has, and
5329                //    skip the pattern location counter past
5330                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5331                fp->fPatIdx += 3;
5332                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5333                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5334                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5335                U_ASSERT(minCount>=0);
5336                U_ASSERT(maxCount>=minCount || maxCount==-1);
5337                U_ASSERT(loopLoc>fp->fPatIdx);
5338
5339                if (minCount == 0) {
5340                    fp = StateSave(fp, loopLoc+1, status);
5341                }
5342                if (maxCount == 0) {
5343                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5344                }
5345            }
5346            break;
5347
5348        case URX_CTR_LOOP:
5349            {
5350                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5351                int32_t initOp = (int32_t)pat[opValue];
5352                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5353                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5354                int32_t minCount  = (int32_t)pat[opValue+2];
5355                int32_t maxCount  = (int32_t)pat[opValue+3];
5356                // Increment the counter.  Note: we DIDN'T worry about counter
5357                //   overflow, since the data comes from UnicodeStrings, which
5358                //   stores its length in an int32_t. Do we have to think about
5359                //   this now that we're using UText? Probably not, since the length
5360                //   in UChar32s is still an int32_t.
5361                (*pCounter)++;
5362                U_ASSERT(*pCounter > 0);
5363                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5364                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
5365                    break;
5366                }
5367                if (*pCounter >= minCount) {
5368                    fp = StateSave(fp, fp->fPatIdx, status);
5369                }
5370                fp->fPatIdx = opValue + 4;    // Loop back.
5371            }
5372            break;
5373
5374        case URX_CTR_INIT_NG:
5375            {
5376                // Initialize a non-greedy loop
5377                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5378                fp->fExtra[opValue] = 0;       //  Set the loop counter variable to zero
5379
5380                // Pick up the three extra operands that CTR_INIT has, and
5381                //    skip the pattern location counter past
5382                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5383                fp->fPatIdx += 3;
5384                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5385                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5386                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5387                U_ASSERT(minCount>=0);
5388                U_ASSERT(maxCount>=minCount || maxCount==-1);
5389                U_ASSERT(loopLoc>fp->fPatIdx);
5390
5391                if (minCount == 0) {
5392                    if (maxCount != 0) {
5393                        fp = StateSave(fp, fp->fPatIdx, status);
5394                    }
5395                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5396                }
5397            }
5398            break;
5399
5400        case URX_CTR_LOOP_NG:
5401            {
5402                // Non-greedy {min, max} loops
5403                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5404                int32_t initOp = (int32_t)pat[opValue];
5405                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5406                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5407                int32_t minCount  = (int32_t)pat[opValue+2];
5408                int32_t maxCount  = (int32_t)pat[opValue+3];
5409                // Increment the counter.  Note: we DIDN'T worry about counter
5410                //   overflow, since the data comes from UnicodeStrings, which
5411                //   stores its length in an int32_t. Do we have to think about
5412                //   this now that we're using UText? Probably not, since the length
5413                //   in UChar32s is still an int32_t.
5414                (*pCounter)++;
5415                U_ASSERT(*pCounter > 0);
5416
5417                if ((uint64_t)*pCounter >= (uint32_t)maxCount) {
5418                    // The loop has matched the maximum permitted number of times.
5419                    //   Break out of here with no action.  Matching will
5420                    //   continue with the following pattern.
5421                    U_ASSERT(*pCounter == maxCount || maxCount == -1);
5422                    break;
5423                }
5424
5425                if (*pCounter < minCount) {
5426                    // We haven't met the minimum number of matches yet.
5427                    //   Loop back for another one.
5428                    fp->fPatIdx = opValue + 4;    // Loop back.
5429                } else {
5430                    // We do have the minimum number of matches.
5431                    //   Fall into the following pattern, but first do
5432                    //   a state save to the top of the loop, so that a failure
5433                    //   in the following pattern will try another iteration of the loop.
5434                    fp = StateSave(fp, opValue + 4, status);
5435                }
5436            }
5437            break;
5438
5439        case URX_STO_SP:
5440            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5441            fData[opValue] = fStack->size();
5442            break;
5443
5444        case URX_LD_SP:
5445            {
5446                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5447                int32_t newStackSize = (int32_t)fData[opValue];
5448                U_ASSERT(newStackSize <= fStack->size());
5449                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5450                if (newFP == (int64_t *)fp) {
5451                    break;
5452                }
5453                int32_t i;
5454                for (i=0; i<fFrameSize; i++) {
5455                    newFP[i] = ((int64_t *)fp)[i];
5456                }
5457                fp = (REStackFrame *)newFP;
5458                fStack->setSize(newStackSize);
5459            }
5460            break;
5461
5462        case URX_BACKREF:
5463        case URX_BACKREF_I:
5464            {
5465                U_ASSERT(opValue < fFrameSize);
5466                int64_t groupStartIdx = fp->fExtra[opValue];
5467                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5468                U_ASSERT(groupStartIdx <= groupEndIdx);
5469                int64_t len = groupEndIdx-groupStartIdx;
5470                if (groupStartIdx < 0) {
5471                    // This capture group has not participated in the match thus far,
5472                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5473                }
5474
5475                if (len == 0) {
5476                        //   The capture group match was of an empty string.
5477                        //   Verified by testing:  Perl matches succeed in this case, so
5478                        //   we do too.
5479                        break;
5480                    }
5481
5482                UBool  haveMatch = FALSE;
5483                if (fp->fInputIdx + len <= fActiveLimit) {
5484                    if (opType == URX_BACKREF) {
5485                        if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) {
5486                            haveMatch = TRUE;
5487                        }
5488                    } else {
5489                        if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx,
5490                                  (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) {
5491                            haveMatch = TRUE;
5492                        }
5493                    }
5494                } else {
5495                    // TODO: probably need to do a partial string comparison, and only
5496                    //       set HitEnd if the available input matched.  Ticket #6074
5497                    fHitEnd = TRUE;
5498                }
5499                if (haveMatch) {
5500                    fp->fInputIdx += len;     // Match.  Advance current input position.
5501                } else {
5502                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5503                }
5504            }
5505            break;
5506
5507        case URX_STO_INP_LOC:
5508            {
5509                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5510                fp->fExtra[opValue] = fp->fInputIdx;
5511            }
5512            break;
5513
5514        case URX_JMPX:
5515            {
5516                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5517                fp->fPatIdx += 1;
5518                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5519                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5520                int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5521                U_ASSERT(savedInputIdx <= fp->fInputIdx);
5522                if (savedInputIdx < fp->fInputIdx) {
5523                    fp->fPatIdx = opValue;                               // JMP
5524                } else {
5525                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5526                }
5527            }
5528            break;
5529
5530        case URX_LA_START:
5531            {
5532                // Entering a lookahead block.
5533                // Save Stack Ptr, Input Pos.
5534                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5535                fData[opValue]   = fStack->size();
5536                fData[opValue+1] = fp->fInputIdx;
5537                fActiveStart     = fLookStart;          // Set the match region change for
5538                fActiveLimit     = fLookLimit;          //   transparent bounds.
5539            }
5540            break;
5541
5542        case URX_LA_END:
5543            {
5544                // Leaving a look-ahead block.
5545                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5546                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5547                int32_t stackSize = fStack->size();
5548                int32_t newStackSize = (int32_t)fData[opValue];
5549                U_ASSERT(stackSize >= newStackSize);
5550                if (stackSize > newStackSize) {
5551                    // Copy the current top frame back to the new (cut back) top frame.
5552                    //   This makes the capture groups from within the look-ahead
5553                    //   expression available.
5554                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5555                    int32_t i;
5556                    for (i=0; i<fFrameSize; i++) {
5557                        newFP[i] = ((int64_t *)fp)[i];
5558                    }
5559                    fp = (REStackFrame *)newFP;
5560                    fStack->setSize(newStackSize);
5561                }
5562                fp->fInputIdx = fData[opValue+1];
5563
5564                // Restore the active region bounds in the input string; they may have
5565                //    been changed because of transparent bounds on a Region.
5566                fActiveStart = fRegionStart;
5567                fActiveLimit = fRegionLimit;
5568            }
5569            break;
5570
5571        case URX_ONECHAR_I:
5572            if (fp->fInputIdx < fActiveLimit) {
5573                UChar32 c;
5574                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5575                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5576                    break;
5577                }
5578            } else {
5579                fHitEnd = TRUE;
5580            }
5581
5582            #ifdef REGEX_SMART_BACKTRACKING
5583            if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) {
5584                REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5585                if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5586                    UBool success = FALSE;
5587                    int64_t reverseIndex = fp->fInputIdx;
5588                    UChar32 c;
5589                    while (reverseIndex > backSearchIndex) {
5590                        U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5591                        if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5592                            success = TRUE;
5593                            break;
5594                        } else if (c == U_SENTINEL) {
5595                            break;
5596                        }
5597                    }
5598                    if (success) {
5599                        fHitEnd = FALSE;
5600                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5601                        fp->fInputIdx = reverseIndex;
5602                        if (fp->fInputIdx > backSearchIndex) {
5603                            fp = StateSave(fp, fp->fPatIdx, status);
5604                        }
5605                        fp->fPatIdx++; // Skip the LOOP_C, we just did that
5606                        break;
5607                    }
5608                }
5609            }
5610            #endif
5611
5612            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5613            break;
5614
5615        case URX_STRING_I:
5616            {
5617                // Test input against a literal string.
5618                // Strings require two slots in the compiled pattern, one for the
5619                //   offset to the string text, and one for the length.
5620                const UCaseProps *csp = ucase_getSingleton();
5621                {
5622                    int32_t stringStartIdx, stringLen;
5623                    stringStartIdx = opValue;
5624
5625                    op      = (int32_t)pat[fp->fPatIdx];
5626                    fp->fPatIdx++;
5627                    opType  = URX_TYPE(op);
5628                    opValue = URX_VAL(op);
5629                    U_ASSERT(opType == URX_STRING_LEN);
5630                    stringLen = opValue;
5631
5632                    const UChar *patternChars = litText+stringStartIdx;
5633                    const UChar *patternEnd = patternChars+stringLen;
5634
5635                    const UChar *foldChars = NULL;
5636                    int32_t foldOffset, foldLength;
5637                    UChar32 c;
5638                    // BEGIN android-changed
5639                    // For ICU ticket#8824
5640                    UBool c_is_valid = FALSE;
5641
5642                    #ifdef REGEX_SMART_BACKTRACKING
5643                    int32_t originalInputIdx = fp->fInputIdx;
5644                    #endif
5645                    UBool success = TRUE;
5646
5647                    foldOffset = foldLength = 0;
5648
5649                    while (patternChars < patternEnd && success) {
5650                        if (fp->fInputIdx < fActiveLimit) {  // don't read past end of string
5651                            if(foldOffset < foldLength) {
5652                                U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5653                                c_is_valid = TRUE;
5654                            } else {
5655                                // test pre-condition of U16_NEXT: i < length
5656                                U_ASSERT(fp->fInputIdx < fActiveLimit);
5657                                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5658                                c_is_valid = TRUE;
5659                                foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5660                                if(foldLength >= 0) {
5661                                    if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5662                                        foldOffset = 0;
5663                                        U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5664                                    } else {
5665                                        c = foldLength;
5666                                        foldLength = foldOffset; // to avoid reading chars from the folding buffer
5667                                    }
5668                                }
5669                            }
5670                        } else {
5671                            c_is_valid = FALSE;
5672                        }
5673
5674                        if (fp->fInputIdx <= fActiveLimit && c_is_valid) {
5675                            if (U_IS_BMP(c)) {
5676                                success = (*patternChars == c);
5677                                patternChars += 1;
5678                            } else if (patternChars+1 < patternEnd) {
5679                                success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c));
5680                                patternChars += 2;
5681                            }
5682                        } else {
5683                            success = FALSE;
5684                            fHitEnd = TRUE;          //   TODO:  See ticket 6074
5685                        }
5686                    }
5687                    // END android-changed
5688
5689                    if (!success) {
5690                        #ifdef REGEX_SMART_BACKTRACKING
5691                        if (fp->fInputIdx > backSearchIndex && fStack->size()) {
5692                            REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize);
5693                            if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) {
5694                                // Reset to last start point
5695                                int64_t reverseIndex = originalInputIdx;
5696                                patternChars = litText+stringStartIdx;
5697
5698                                // Search backwards for a possible start
5699                                do {
5700                                    U16_PREV(inputBuf, backSearchIndex, reverseIndex, c);
5701                                    foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT);
5702                                    if(foldLength >= 0) {
5703                                        if(foldLength <= UCASE_MAX_STRING_LENGTH) {   // !!!: Does not correctly handle chars that fold to 0-length strings
5704                                            foldOffset = 0;
5705                                            U16_NEXT_UNSAFE(foldChars, foldOffset, c);
5706                                        } else {
5707                                            c = foldLength;
5708                                            foldLength = foldOffset; // to avoid reading chars from the folding buffer
5709                                        }
5710                                    }
5711
5712                                    if ((U_IS_BMP(c) && *patternChars == c) ||
5713                                           (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) {
5714                                        success = TRUE;
5715                                        break;
5716                                    }
5717                                } while (reverseIndex > backSearchIndex);
5718
5719                                // And try again
5720                                if (success) {
5721                                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5722                                    fp->fInputIdx = reverseIndex;
5723                                    if (fp->fInputIdx > backSearchIndex) {
5724                                        fp = StateSave(fp, fp->fPatIdx, status);
5725                                    }
5726                                    fp->fPatIdx++; // Skip the LOOP_C, we just did that
5727                                    break;
5728                                }
5729                            }
5730                        }
5731                        #endif
5732                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5733                    }
5734                }
5735            }
5736            break;
5737
5738        case URX_LB_START:
5739            {
5740                // Entering a look-behind block.
5741                // Save Stack Ptr, Input Pos.
5742                //   TODO:  implement transparent bounds.  Ticket #6067
5743                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5744                fData[opValue]   = fStack->size();
5745                fData[opValue+1] = fp->fInputIdx;
5746                // Init the variable containing the start index for attempted matches.
5747                fData[opValue+2] = -1;
5748                // Save input string length, then reset to pin any matches to end at
5749                //   the current position.
5750                fData[opValue+3] = fActiveLimit;
5751                fActiveLimit     = fp->fInputIdx;
5752            }
5753            break;
5754
5755
5756        case URX_LB_CONT:
5757            {
5758                // Positive Look-Behind, at top of loop checking for matches of LB expression
5759                //    at all possible input starting positions.
5760
5761                // Fetch the min and max possible match lengths.  They are the operands
5762                //   of this op in the pattern.
5763                int32_t minML = (int32_t)pat[fp->fPatIdx++];
5764                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5765                U_ASSERT(minML <= maxML);
5766                U_ASSERT(minML >= 0);
5767
5768                // Fetch (from data) the last input index where a match was attempted.
5769                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5770                int64_t  *lbStartIdx = &fData[opValue+2];
5771                if (*lbStartIdx < 0) {
5772                    // First time through loop.
5773                    *lbStartIdx = fp->fInputIdx - minML;
5774                } else {
5775                    // 2nd through nth time through the loop.
5776                    // Back up start position for match by one.
5777                    if (*lbStartIdx == 0) {
5778                        (*lbStartIdx)--;
5779                    } else {
5780                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
5781                    }
5782                }
5783
5784                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5785                    // We have tried all potential match starting points without
5786                    //  getting a match.  Backtrack out, and out of the
5787                    //   Look Behind altogether.
5788                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5789                    int64_t restoreInputLen = fData[opValue+3];
5790                    U_ASSERT(restoreInputLen >= fActiveLimit);
5791                    U_ASSERT(restoreInputLen <= fInputLength);
5792                    fActiveLimit = restoreInputLen;
5793                    break;
5794                }
5795
5796                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5797                //      (successful match will fall off the end of the loop.)
5798                fp = StateSave(fp, fp->fPatIdx-3, status);
5799                fp->fInputIdx =  *lbStartIdx;
5800            }
5801            break;
5802
5803        case URX_LB_END:
5804            // End of a look-behind block, after a successful match.
5805            {
5806                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5807                if (fp->fInputIdx != fActiveLimit) {
5808                    //  The look-behind expression matched, but the match did not
5809                    //    extend all the way to the point that we are looking behind from.
5810                    //  FAIL out of here, which will take us back to the LB_CONT, which
5811                    //     will retry the match starting at another position or fail
5812                    //     the look-behind altogether, whichever is appropriate.
5813                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5814                    break;
5815                }
5816
5817                // Look-behind match is good.  Restore the orignal input string length,
5818                //   which had been truncated to pin the end of the lookbehind match to the
5819                //   position being looked-behind.
5820                int64_t originalInputLen = fData[opValue+3];
5821                U_ASSERT(originalInputLen >= fActiveLimit);
5822                U_ASSERT(originalInputLen <= fInputLength);
5823                fActiveLimit = originalInputLen;
5824            }
5825            break;
5826
5827
5828        case URX_LBN_CONT:
5829            {
5830                // Negative Look-Behind, at top of loop checking for matches of LB expression
5831                //    at all possible input starting positions.
5832
5833                // Fetch the extra parameters of this op.
5834                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5835                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5836                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5837                continueLoc = URX_VAL(continueLoc);
5838                U_ASSERT(minML <= maxML);
5839                U_ASSERT(minML >= 0);
5840                U_ASSERT(continueLoc > fp->fPatIdx);
5841
5842                // Fetch (from data) the last input index where a match was attempted.
5843                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5844                int64_t  *lbStartIdx = &fData[opValue+2];
5845                if (*lbStartIdx < 0) {
5846                    // First time through loop.
5847                    *lbStartIdx = fp->fInputIdx - minML;
5848                } else {
5849                    // 2nd through nth time through the loop.
5850                    // Back up start position for match by one.
5851                    if (*lbStartIdx == 0) {
5852                        (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
5853                    } else {
5854                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
5855                    }
5856                }
5857
5858                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5859                    // We have tried all potential match starting points without
5860                    //  getting a match, which means that the negative lookbehind as
5861                    //  a whole has succeeded.  Jump forward to the continue location
5862                    int64_t restoreInputLen = fData[opValue+3];
5863                    U_ASSERT(restoreInputLen >= fActiveLimit);
5864                    U_ASSERT(restoreInputLen <= fInputLength);
5865                    fActiveLimit = restoreInputLen;
5866                    fp->fPatIdx = continueLoc;
5867                    break;
5868                }
5869
5870                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5871                //      (successful match will cause a FAIL out of the loop altogether.)
5872                fp = StateSave(fp, fp->fPatIdx-4, status);
5873                fp->fInputIdx =  *lbStartIdx;
5874            }
5875            break;
5876
5877        case URX_LBN_END:
5878            // End of a negative look-behind block, after a successful match.
5879            {
5880                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5881                if (fp->fInputIdx != fActiveLimit) {
5882                    //  The look-behind expression matched, but the match did not
5883                    //    extend all the way to the point that we are looking behind from.
5884                    //  FAIL out of here, which will take us back to the LB_CONT, which
5885                    //     will retry the match starting at another position or succeed
5886                    //     the look-behind altogether, whichever is appropriate.
5887                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5888                    break;
5889                }
5890
5891                // Look-behind expression matched, which means look-behind test as
5892                //   a whole Fails
5893
5894                //   Restore the orignal input string length, which had been truncated
5895                //   inorder to pin the end of the lookbehind match
5896                //   to the position being looked-behind.
5897                int64_t originalInputLen = fData[opValue+3];
5898                U_ASSERT(originalInputLen >= fActiveLimit);
5899                U_ASSERT(originalInputLen <= fInputLength);
5900                fActiveLimit = originalInputLen;
5901
5902                // Restore original stack position, discarding any state saved
5903                //   by the successful pattern match.
5904                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5905                int32_t newStackSize = (int32_t)fData[opValue];
5906                U_ASSERT(fStack->size() > newStackSize);
5907                fStack->setSize(newStackSize);
5908
5909                //  FAIL, which will take control back to someplace
5910                //  prior to entering the look-behind test.
5911                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5912            }
5913            break;
5914
5915
5916        case URX_LOOP_SR_I:
5917            // Loop Initialization for the optimized implementation of
5918            //     [some character set]*
5919            //   This op scans through all matching input.
5920            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5921            {
5922                U_ASSERT(opValue > 0 && opValue < sets->size());
5923                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5924                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5925
5926                // Loop through input, until either the input is exhausted or
5927                //   we reach a character that is not a member of the set.
5928                int32_t ix = (int32_t)fp->fInputIdx;
5929                for (;;) {
5930                    if (ix >= fActiveLimit) {
5931                        fHitEnd = TRUE;
5932                        break;
5933                    }
5934                    UChar32   c;
5935                    U16_NEXT(inputBuf, ix, fActiveLimit, c);
5936                    if (c<256) {
5937                        if (s8->contains(c) == FALSE) {
5938                            U16_BACK_1(inputBuf, 0, ix);
5939                            break;
5940                        }
5941                    } else {
5942                        if (s->contains(c) == FALSE) {
5943                            U16_BACK_1(inputBuf, 0, ix);
5944                            break;
5945                        }
5946                    }
5947                }
5948
5949                // If there were no matching characters, skip over the loop altogether.
5950                //   The loop doesn't run at all, a * op always succeeds.
5951                if (ix == fp->fInputIdx) {
5952                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5953                    break;
5954                }
5955
5956                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5957                //   must follow.  It's operand is the stack location
5958                //   that holds the starting input index for the match of this [set]*
5959                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5960                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5961                int32_t stackLoc = URX_VAL(loopcOp);
5962                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5963                fp->fExtra[stackLoc] = fp->fInputIdx;
5964                #ifdef REGEX_SMART_BACKTRACKING
5965                backSearchIndex = fp->fInputIdx;
5966                #endif
5967                fp->fInputIdx = ix;
5968
5969                // Save State to the URX_LOOP_C op that follows this one,
5970                //   so that match failures in the following code will return to there.
5971                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5972                fp = StateSave(fp, fp->fPatIdx, status);
5973                fp->fPatIdx++;
5974            }
5975            break;
5976
5977
5978        case URX_LOOP_DOT_I:
5979            // Loop Initialization for the optimized implementation of .*
5980            //   This op scans through all remaining input.
5981            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5982            {
5983                // Loop through input until the input is exhausted (we reach an end-of-line)
5984                // In DOTALL mode, we can just go straight to the end of the input.
5985                int32_t ix;
5986                if ((opValue & 1) == 1) {
5987                    // Dot-matches-All mode.  Jump straight to the end of the string.
5988                    ix = (int32_t)fActiveLimit;
5989                    fHitEnd = TRUE;
5990                } else {
5991                    // NOT DOT ALL mode.  Line endings do not match '.'
5992                    // Scan forward until a line ending or end of input.
5993                    ix = (int32_t)fp->fInputIdx;
5994                    for (;;) {
5995                        if (ix >= fActiveLimit) {
5996                            fHitEnd = TRUE;
5997                            break;
5998                        }
5999                        UChar32   c;
6000                        U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
6001                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
6002                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
6003                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
6004                                   ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
6005                                //  char is a line ending.  Put the input pos back to the
6006                                //    line ending char, and exit the scanning loop.
6007                                U16_BACK_1(inputBuf, 0, ix);
6008                                break;
6009                            }
6010                        }
6011                    }
6012                }
6013
6014                // If there were no matching characters, skip over the loop altogether.
6015                //   The loop doesn't run at all, a * op always succeeds.
6016                if (ix == fp->fInputIdx) {
6017                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
6018                    break;
6019                }
6020
6021                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
6022                //   must follow.  It's operand is the stack location
6023                //   that holds the starting input index for the match of this .*
6024                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
6025                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
6026                int32_t stackLoc = URX_VAL(loopcOp);
6027                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
6028                fp->fExtra[stackLoc] = fp->fInputIdx;
6029                #ifdef REGEX_SMART_BACKTRACKING
6030                backSearchIndex = fp->fInputIdx;
6031                #endif
6032                fp->fInputIdx = ix;
6033
6034                // Save State to the URX_LOOP_C op that follows this one,
6035                //   so that match failures in the following code will return to there.
6036                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
6037                fp = StateSave(fp, fp->fPatIdx, status);
6038                fp->fPatIdx++;
6039            }
6040            break;
6041
6042
6043        case URX_LOOP_C:
6044            {
6045                U_ASSERT(opValue>=0 && opValue<fFrameSize);
6046                backSearchIndex = (int32_t)fp->fExtra[opValue];
6047                U_ASSERT(backSearchIndex <= fp->fInputIdx);
6048                if (backSearchIndex == fp->fInputIdx) {
6049                    // We've backed up the input idx to the point that the loop started.
6050                    // The loop is done.  Leave here without saving state.
6051                    //  Subsequent failures won't come back here.
6052                    break;
6053                }
6054                // Set up for the next iteration of the loop, with input index
6055                //   backed up by one from the last time through,
6056                //   and a state save to this instruction in case the following code fails again.
6057                //   (We're going backwards because this loop emulates stack unwinding, not
6058                //    the initial scan forward.)
6059                U_ASSERT(fp->fInputIdx > 0);
6060                UChar32 prevC;
6061                U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
6062
6063                if (prevC == 0x0a &&
6064                    fp->fInputIdx > backSearchIndex &&
6065                    inputBuf[fp->fInputIdx-1] == 0x0d) {
6066                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
6067                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
6068                        // .*, stepping back over CRLF pair.
6069                        U16_BACK_1(inputBuf, 0, fp->fInputIdx);
6070                    }
6071                }
6072
6073
6074                fp = StateSave(fp, fp->fPatIdx-1, status);
6075            }
6076            break;
6077
6078
6079
6080        default:
6081            // Trouble.  The compiled pattern contains an entry with an
6082            //           unrecognized type tag.
6083            U_ASSERT(FALSE);
6084        }
6085
6086        if (U_FAILURE(status)) {
6087            isMatch = FALSE;
6088            break;
6089        }
6090    }
6091
6092breakFromLoop:
6093    fMatch = isMatch;
6094    if (isMatch) {
6095        fLastMatchEnd = fMatchEnd;
6096        fMatchStart   = startIdx;
6097        fMatchEnd     = fp->fInputIdx;
6098        if (fTraceDebug) {
6099            REGEX_RUN_DEBUG_PRINTF(("Match.  start=%d   end=%d\n\n", fMatchStart, fMatchEnd));
6100        }
6101    }
6102    else
6103    {
6104        if (fTraceDebug) {
6105            REGEX_RUN_DEBUG_PRINTF(("No match\n\n"));
6106        }
6107    }
6108
6109    fFrame = fp;                // The active stack frame when the engine stopped.
6110    //   Contains the capture group results that we need to
6111    //    access later.
6112
6113    return;
6114}
6115
6116
6117UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
6118
6119U_NAMESPACE_END
6120
6121#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
6122
6123