1/*
2**************************************************************************
3*   Copyright (C) 2002-2013 International Business Machines Corporation  *
4*   and others. All rights reserved.                                     *
5**************************************************************************
6*/
7//
8//  file:  rematch.cpp
9//
10//         Contains the implementation of class RegexMatcher,
11//         which is one of the main API classes for the ICU regular expression package.
12//
13
14#include "unicode/utypes.h"
15#if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17#include "unicode/regex.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/ustring.h"
21#include "unicode/rbbi.h"
22#include "unicode/utf.h"
23#include "unicode/utf16.h"
24#include "uassert.h"
25#include "cmemory.h"
26#include "uvector.h"
27#include "uvectr32.h"
28#include "uvectr64.h"
29#include "regeximp.h"
30#include "regexst.h"
31#include "regextxt.h"
32#include "ucase.h"
33
34// #include <malloc.h>        // Needed for heapcheck testing
35
36
37// Find progress callback
38// ----------------------
39// Macro to inline test & call to ReportFindProgress().  Eliminates unnecessary function call.
40//
41#define REGEXFINDPROGRESS_INTERRUPT(pos, status)     \
42    (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE)
43
44
45// Smart Backtracking
46// ------------------
47// When a failure would go back to a LOOP_C instruction,
48// strings, characters, and setrefs scan backwards for a valid start
49// character themselves, pop the stack, and save state, emulating the
50// LOOP_C's effect but assured that the next character of input is a
51// possible matching character.
52//
53// Good idea in theory; unfortunately it only helps out a few specific
54// cases and slows the engine down a little in the rest.
55
56U_NAMESPACE_BEGIN
57
58// Default limit for the size of the back track stack, to avoid system
59//    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
60// This value puts ICU's limits higher than most other regexp implementations,
61//    which use recursion rather than the heap, and take more storage per
62//    backtrack point.
63//
64static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
65
66// Time limit counter constant.
67//   Time limits for expression evaluation are in terms of quanta of work by
68//   the engine, each of which is 10,000 state saves.
69//   This constant determines that state saves per tick number.
70static const int32_t TIMER_INITIAL_VALUE = 10000;
71
72//-----------------------------------------------------------------------------
73//
74//   Constructor and Destructor
75//
76//-----------------------------------------------------------------------------
77RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
78    fDeferredStatus = U_ZERO_ERROR;
79    init(fDeferredStatus);
80    if (U_FAILURE(fDeferredStatus)) {
81        return;
82    }
83    if (pat==NULL) {
84        fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
85        return;
86    }
87    fPattern = pat;
88    init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
89}
90
91
92
93RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
94                           uint32_t flags, UErrorCode &status) {
95    init(status);
96    if (U_FAILURE(status)) {
97        return;
98    }
99    UParseError    pe;
100    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
101    fPattern           = fPatternOwned;
102
103    UText inputText = UTEXT_INITIALIZER;
104    utext_openConstUnicodeString(&inputText, &input, &status);
105    init2(&inputText, status);
106    utext_close(&inputText);
107
108    fInputUniStrMaybeMutable = TRUE;
109}
110
111
112RegexMatcher::RegexMatcher(UText *regexp, UText *input,
113                           uint32_t flags, UErrorCode &status) {
114    init(status);
115    if (U_FAILURE(status)) {
116        return;
117    }
118    UParseError    pe;
119    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
120    if (U_FAILURE(status)) {
121        return;
122    }
123
124    fPattern           = fPatternOwned;
125    init2(input, status);
126}
127
128
129RegexMatcher::RegexMatcher(const UnicodeString &regexp,
130                           uint32_t flags, UErrorCode &status) {
131    init(status);
132    if (U_FAILURE(status)) {
133        return;
134    }
135    UParseError    pe;
136    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
137    if (U_FAILURE(status)) {
138        return;
139    }
140    fPattern           = fPatternOwned;
141    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
142}
143
144RegexMatcher::RegexMatcher(UText *regexp,
145                           uint32_t flags, UErrorCode &status) {
146    init(status);
147    if (U_FAILURE(status)) {
148        return;
149    }
150    UParseError    pe;
151    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
152        if (U_FAILURE(status)) {
153        return;
154    }
155
156    fPattern           = fPatternOwned;
157    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
158}
159
160
161
162
163RegexMatcher::~RegexMatcher() {
164    delete fStack;
165    if (fData != fSmallData) {
166        uprv_free(fData);
167        fData = NULL;
168    }
169    if (fPatternOwned) {
170        delete fPatternOwned;
171        fPatternOwned = NULL;
172        fPattern = NULL;
173    }
174
175    if (fInput) {
176        delete fInput;
177    }
178    if (fInputText) {
179        utext_close(fInputText);
180    }
181    if (fAltInputText) {
182        utext_close(fAltInputText);
183    }
184
185    #if UCONFIG_NO_BREAK_ITERATION==0
186    delete fWordBreakItr;
187    #endif
188}
189
190//
191//   init()   common initialization for use by all constructors.
192//            Initialize all fields, get the object into a consistent state.
193//            This must be done even when the initial status shows an error,
194//            so that the object is initialized sufficiently well for the destructor
195//            to run safely.
196//
197void RegexMatcher::init(UErrorCode &status) {
198    fPattern           = NULL;
199    fPatternOwned      = NULL;
200    fFrameSize         = 0;
201    fRegionStart       = 0;
202    fRegionLimit       = 0;
203    fAnchorStart       = 0;
204    fAnchorLimit       = 0;
205    fLookStart         = 0;
206    fLookLimit         = 0;
207    fActiveStart       = 0;
208    fActiveLimit       = 0;
209    fTransparentBounds = FALSE;
210    fAnchoringBounds   = TRUE;
211    fMatch             = FALSE;
212    fMatchStart        = 0;
213    fMatchEnd          = 0;
214    fLastMatchEnd      = -1;
215    fAppendPosition    = 0;
216    fHitEnd            = FALSE;
217    fRequireEnd        = FALSE;
218    fStack             = NULL;
219    fFrame             = NULL;
220    fTimeLimit         = 0;
221    fTime              = 0;
222    fTickCounter       = 0;
223    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
224    fCallbackFn        = NULL;
225    fCallbackContext   = NULL;
226    fFindProgressCallbackFn      = NULL;
227    fFindProgressCallbackContext = NULL;
228    fTraceDebug        = FALSE;
229    fDeferredStatus    = status;
230    fData              = fSmallData;
231    fWordBreakItr      = NULL;
232
233    fStack             = NULL;
234    fInputText         = NULL;
235    fAltInputText      = NULL;
236    fInput             = NULL;
237    fInputLength       = 0;
238    fInputUniStrMaybeMutable = FALSE;
239
240    if (U_FAILURE(status)) {
241        fDeferredStatus = status;
242    }
243}
244
245//
246//  init2()   Common initialization for use by RegexMatcher constructors, part 2.
247//            This handles the common setup to be done after the Pattern is available.
248//
249void RegexMatcher::init2(UText *input, UErrorCode &status) {
250    if (U_FAILURE(status)) {
251        fDeferredStatus = status;
252        return;
253    }
254
255    if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
256        fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
257        if (fData == NULL) {
258            status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
259            return;
260        }
261    }
262
263    fStack = new UVector64(status);
264    if (fStack == NULL) {
265        status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
266        return;
267    }
268
269    reset(input);
270    setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
271    if (U_FAILURE(status)) {
272        fDeferredStatus = status;
273        return;
274    }
275}
276
277
278static const UChar BACKSLASH  = 0x5c;
279static const UChar DOLLARSIGN = 0x24;
280//--------------------------------------------------------------------------------
281//
282//    appendReplacement
283//
284//--------------------------------------------------------------------------------
285RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
286                                              const UnicodeString &replacement,
287                                              UErrorCode &status) {
288    UText replacementText = UTEXT_INITIALIZER;
289
290    utext_openConstUnicodeString(&replacementText, &replacement, &status);
291    if (U_SUCCESS(status)) {
292        UText resultText = UTEXT_INITIALIZER;
293        utext_openUnicodeString(&resultText, &dest, &status);
294
295        if (U_SUCCESS(status)) {
296            appendReplacement(&resultText, &replacementText, status);
297            utext_close(&resultText);
298        }
299        utext_close(&replacementText);
300    }
301
302    return *this;
303}
304
305//
306//    appendReplacement, UText mode
307//
308RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
309                                              UText *replacement,
310                                              UErrorCode &status) {
311    if (U_FAILURE(status)) {
312        return *this;
313    }
314    if (U_FAILURE(fDeferredStatus)) {
315        status = fDeferredStatus;
316        return *this;
317    }
318    if (fMatch == FALSE) {
319        status = U_REGEX_INVALID_STATE;
320        return *this;
321    }
322
323    // Copy input string from the end of previous match to start of current match
324    int64_t  destLen = utext_nativeLength(dest);
325    if (fMatchStart > fAppendPosition) {
326        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
327            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
328                                     (int32_t)(fMatchStart-fAppendPosition), &status);
329        } else {
330            int32_t len16;
331            if (UTEXT_USES_U16(fInputText)) {
332                len16 = (int32_t)(fMatchStart-fAppendPosition);
333            } else {
334                UErrorCode lengthStatus = U_ZERO_ERROR;
335                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
336            }
337            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
338            if (inputChars == NULL) {
339                status = U_MEMORY_ALLOCATION_ERROR;
340                return *this;
341            }
342            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
343            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
344            uprv_free(inputChars);
345        }
346    }
347    fAppendPosition = fMatchEnd;
348
349
350    // scan the replacement text, looking for substitutions ($n) and \escapes.
351    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
352    //         move entire ranges not containing substitutions.
353    UTEXT_SETNATIVEINDEX(replacement, 0);
354    UChar32 c = UTEXT_NEXT32(replacement);
355    while (c != U_SENTINEL) {
356        if (c == BACKSLASH) {
357            // Backslash Escape.  Copy the following char out without further checks.
358            //                    Note:  Surrogate pairs don't need any special handling
359            //                           The second half wont be a '$' or a '\', and
360            //                           will move to the dest normally on the next
361            //                           loop iteration.
362            c = UTEXT_CURRENT32(replacement);
363            if (c == U_SENTINEL) {
364                break;
365            }
366
367            if (c==0x55/*U*/ || c==0x75/*u*/) {
368                // We have a \udddd or \Udddddddd escape sequence.
369                int32_t offset = 0;
370                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
371                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
372                if (escapedChar != (UChar32)0xFFFFFFFF) {
373                    if (U_IS_BMP(escapedChar)) {
374                        UChar c16 = (UChar)escapedChar;
375                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
376                    } else {
377                        UChar surrogate[2];
378                        surrogate[0] = U16_LEAD(escapedChar);
379                        surrogate[1] = U16_TRAIL(escapedChar);
380                        if (U_SUCCESS(status)) {
381                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
382                        }
383                    }
384                    // TODO:  Report errors for mal-formed \u escapes?
385                    //        As this is, the original sequence is output, which may be OK.
386                    if (context.lastOffset == offset) {
387                        (void)UTEXT_PREVIOUS32(replacement);
388                    } else if (context.lastOffset != offset-1) {
389                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
390                    }
391                }
392            } else {
393                (void)UTEXT_NEXT32(replacement);
394                // Plain backslash escape.  Just put out the escaped character.
395                if (U_IS_BMP(c)) {
396                    UChar c16 = (UChar)c;
397                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
398                } else {
399                    UChar surrogate[2];
400                    surrogate[0] = U16_LEAD(c);
401                    surrogate[1] = U16_TRAIL(c);
402                    if (U_SUCCESS(status)) {
403                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
404                    }
405                }
406            }
407        } else if (c != DOLLARSIGN) {
408            // Normal char, not a $.  Copy it out without further checks.
409            if (U_IS_BMP(c)) {
410                UChar c16 = (UChar)c;
411                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
412            } else {
413                UChar surrogate[2];
414                surrogate[0] = U16_LEAD(c);
415                surrogate[1] = U16_TRAIL(c);
416                if (U_SUCCESS(status)) {
417                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
418                }
419            }
420        } else {
421            // We've got a $.  Pick up a capture group number if one follows.
422            // Consume at most the number of digits necessary for the largest capture
423            // number that is valid for this pattern.
424
425            int32_t numDigits = 0;
426            int32_t groupNum  = 0;
427            UChar32 digitC;
428            for (;;) {
429                digitC = UTEXT_CURRENT32(replacement);
430                if (digitC == U_SENTINEL) {
431                    break;
432                }
433                if (u_isdigit(digitC) == FALSE) {
434                    break;
435                }
436                (void)UTEXT_NEXT32(replacement);
437                groupNum=groupNum*10 + u_charDigitValue(digitC);
438                numDigits++;
439                if (numDigits >= fPattern->fMaxCaptureDigits) {
440                    break;
441                }
442            }
443
444
445            if (numDigits == 0) {
446                // The $ didn't introduce a group number at all.
447                // Treat it as just part of the substitution text.
448                UChar c16 = DOLLARSIGN;
449                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
450            } else {
451                // Finally, append the capture group data to the destination.
452                destLen += appendGroup(groupNum, dest, status);
453                if (U_FAILURE(status)) {
454                    // Can fail if group number is out of range.
455                    break;
456                }
457            }
458        }
459
460        if (U_FAILURE(status)) {
461            break;
462        } else {
463            c = UTEXT_NEXT32(replacement);
464        }
465    }
466
467    return *this;
468}
469
470
471
472//--------------------------------------------------------------------------------
473//
474//    appendTail     Intended to be used in conjunction with appendReplacement()
475//                   To the destination string, append everything following
476//                   the last match position from the input string.
477//
478//                   Note:  Match ranges do not affect appendTail or appendReplacement
479//
480//--------------------------------------------------------------------------------
481UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
482    UErrorCode status = U_ZERO_ERROR;
483    UText resultText = UTEXT_INITIALIZER;
484    utext_openUnicodeString(&resultText, &dest, &status);
485
486    if (U_SUCCESS(status)) {
487        appendTail(&resultText, status);
488        utext_close(&resultText);
489    }
490
491    return dest;
492}
493
494//
495//   appendTail, UText mode
496//
497UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
498    if (U_FAILURE(status)) {
499        return dest;
500    }
501    if (U_FAILURE(fDeferredStatus)) {
502        status = fDeferredStatus;
503        return dest;
504    }
505
506    if (fInputLength > fAppendPosition) {
507        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
508            int64_t destLen = utext_nativeLength(dest);
509            utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
510                          (int32_t)(fInputLength-fAppendPosition), &status);
511        } else {
512            int32_t len16;
513            if (UTEXT_USES_U16(fInputText)) {
514                len16 = (int32_t)(fInputLength-fAppendPosition);
515            } else {
516                len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
517                status = U_ZERO_ERROR; // buffer overflow
518            }
519
520            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
521            if (inputChars == NULL) {
522                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
523            } else {
524                utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
525                int64_t destLen = utext_nativeLength(dest);
526                utext_replace(dest, destLen, destLen, inputChars, len16, &status);
527                uprv_free(inputChars);
528            }
529        }
530    }
531    return dest;
532}
533
534
535
536//--------------------------------------------------------------------------------
537//
538//   end
539//
540//--------------------------------------------------------------------------------
541int32_t RegexMatcher::end(UErrorCode &err) const {
542    return end(0, err);
543}
544
545int64_t RegexMatcher::end64(UErrorCode &err) const {
546    return end64(0, err);
547}
548
549int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
550    if (U_FAILURE(err)) {
551        return -1;
552    }
553    if (fMatch == FALSE) {
554        err = U_REGEX_INVALID_STATE;
555        return -1;
556    }
557    if (group < 0 || group > fPattern->fGroupMap->size()) {
558        err = U_INDEX_OUTOFBOUNDS_ERROR;
559        return -1;
560    }
561    int64_t e = -1;
562    if (group == 0) {
563        e = fMatchEnd;
564    } else {
565        // Get the position within the stack frame of the variables for
566        //    this capture group.
567        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
568        U_ASSERT(groupOffset < fPattern->fFrameSize);
569        U_ASSERT(groupOffset >= 0);
570        e = fFrame->fExtra[groupOffset + 1];
571    }
572
573        return e;
574}
575
576int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
577    return (int32_t)end64(group, err);
578}
579
580
581//--------------------------------------------------------------------------------
582//
583//   find()
584//
585//--------------------------------------------------------------------------------
586UBool RegexMatcher::find() {
587    // Start at the position of the last match end.  (Will be zero if the
588    //   matcher has been reset.)
589    //
590    if (U_FAILURE(fDeferredStatus)) {
591        return FALSE;
592    }
593
594    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
595        return findUsingChunk();
596    }
597
598    int64_t startPos = fMatchEnd;
599    if (startPos==0) {
600        startPos = fActiveStart;
601    }
602
603    if (fMatch) {
604        // Save the position of any previous successful match.
605        fLastMatchEnd = fMatchEnd;
606
607        if (fMatchStart == fMatchEnd) {
608            // Previous match had zero length.  Move start position up one position
609            //  to avoid sending find() into a loop on zero-length matches.
610            if (startPos >= fActiveLimit) {
611                fMatch = FALSE;
612                fHitEnd = TRUE;
613                return FALSE;
614            }
615            UTEXT_SETNATIVEINDEX(fInputText, startPos);
616            (void)UTEXT_NEXT32(fInputText);
617            startPos = UTEXT_GETNATIVEINDEX(fInputText);
618        }
619    } else {
620        if (fLastMatchEnd >= 0) {
621            // A previous find() failed to match.  Don't try again.
622            //   (without this test, a pattern with a zero-length match
623            //    could match again at the end of an input string.)
624            fHitEnd = TRUE;
625            return FALSE;
626        }
627    }
628
629
630    // Compute the position in the input string beyond which a match can not begin, because
631    //   the minimum length match would extend past the end of the input.
632    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
633    //          Be aware of possible overflows if making changes here.
634    int64_t testStartLimit;
635    if (UTEXT_USES_U16(fInputText)) {
636        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
637        if (startPos > testStartLimit) {
638            fMatch = FALSE;
639            fHitEnd = TRUE;
640            return FALSE;
641        }
642    } else {
643        // For now, let the matcher discover that it can't match on its own
644        // We don't know how long the match len is in native characters
645        testStartLimit = fActiveLimit;
646    }
647
648    UChar32  c;
649    U_ASSERT(startPos >= 0);
650
651    switch (fPattern->fStartType) {
652    case START_NO_INFO:
653        // No optimization was found.
654        //  Try a match at each input position.
655        for (;;) {
656            MatchAt(startPos, FALSE, fDeferredStatus);
657            if (U_FAILURE(fDeferredStatus)) {
658                return FALSE;
659            }
660            if (fMatch) {
661                return TRUE;
662            }
663            if (startPos >= testStartLimit) {
664                fHitEnd = TRUE;
665                return FALSE;
666            }
667            UTEXT_SETNATIVEINDEX(fInputText, startPos);
668            (void)UTEXT_NEXT32(fInputText);
669            startPos = UTEXT_GETNATIVEINDEX(fInputText);
670            // Note that it's perfectly OK for a pattern to have a zero-length
671            //   match at the end of a string, so we must make sure that the loop
672            //   runs with startPos == testStartLimit the last time through.
673            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
674                return FALSE;
675        }
676        U_ASSERT(FALSE);
677
678    case START_START:
679        // Matches are only possible at the start of the input string
680        //   (pattern begins with ^ or \A)
681        if (startPos > fActiveStart) {
682            fMatch = FALSE;
683            return FALSE;
684        }
685        MatchAt(startPos, FALSE, fDeferredStatus);
686        if (U_FAILURE(fDeferredStatus)) {
687            return FALSE;
688        }
689        return fMatch;
690
691
692    case START_SET:
693        {
694            // Match may start on any char from a pre-computed set.
695            U_ASSERT(fPattern->fMinMatchLen > 0);
696            int64_t pos;
697            UTEXT_SETNATIVEINDEX(fInputText, startPos);
698            for (;;) {
699                c = UTEXT_NEXT32(fInputText);
700                pos = UTEXT_GETNATIVEINDEX(fInputText);
701                // c will be -1 (U_SENTINEL) at end of text, in which case we
702                // skip this next block (so we don't have a negative array index)
703                // and handle end of text in the following block.
704                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
705                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
706                    MatchAt(startPos, FALSE, fDeferredStatus);
707                    if (U_FAILURE(fDeferredStatus)) {
708                        return FALSE;
709                    }
710                    if (fMatch) {
711                        return TRUE;
712                    }
713                    UTEXT_SETNATIVEINDEX(fInputText, pos);
714                }
715                if (startPos >= testStartLimit) {
716                    fMatch = FALSE;
717                    fHitEnd = TRUE;
718                    return FALSE;
719                }
720                startPos = pos;
721	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
722                    return FALSE;
723            }
724        }
725        U_ASSERT(FALSE);
726
727    case START_STRING:
728    case START_CHAR:
729        {
730            // Match starts on exactly one char.
731            U_ASSERT(fPattern->fMinMatchLen > 0);
732            UChar32 theChar = fPattern->fInitialChar;
733            int64_t pos;
734            UTEXT_SETNATIVEINDEX(fInputText, startPos);
735            for (;;) {
736                c = UTEXT_NEXT32(fInputText);
737                pos = UTEXT_GETNATIVEINDEX(fInputText);
738                if (c == theChar) {
739                    MatchAt(startPos, FALSE, fDeferredStatus);
740                    if (U_FAILURE(fDeferredStatus)) {
741                        return FALSE;
742                    }
743                    if (fMatch) {
744                        return TRUE;
745                    }
746                    UTEXT_SETNATIVEINDEX(fInputText, pos);
747                }
748                if (startPos >= testStartLimit) {
749                    fMatch = FALSE;
750                    fHitEnd = TRUE;
751                    return FALSE;
752                }
753                startPos = pos;
754	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
755                    return FALSE;
756           }
757        }
758        U_ASSERT(FALSE);
759
760    case START_LINE:
761        {
762            UChar32  c;
763            if (startPos == fAnchorStart) {
764                MatchAt(startPos, FALSE, fDeferredStatus);
765                if (U_FAILURE(fDeferredStatus)) {
766                    return FALSE;
767                }
768                if (fMatch) {
769                    return TRUE;
770                }
771                UTEXT_SETNATIVEINDEX(fInputText, startPos);
772                c = UTEXT_NEXT32(fInputText);
773                startPos = UTEXT_GETNATIVEINDEX(fInputText);
774            } else {
775                UTEXT_SETNATIVEINDEX(fInputText, startPos);
776                c = UTEXT_PREVIOUS32(fInputText);
777                UTEXT_SETNATIVEINDEX(fInputText, startPos);
778            }
779
780            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
781                for (;;) {
782                    if (c == 0x0a) {
783                            MatchAt(startPos, FALSE, fDeferredStatus);
784                            if (U_FAILURE(fDeferredStatus)) {
785                                return FALSE;
786                            }
787                            if (fMatch) {
788                                return TRUE;
789                            }
790                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
791                    }
792                    if (startPos >= testStartLimit) {
793                        fMatch = FALSE;
794                        fHitEnd = TRUE;
795                        return FALSE;
796                    }
797                    c = UTEXT_NEXT32(fInputText);
798                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
799                    // Note that it's perfectly OK for a pattern to have a zero-length
800                    //   match at the end of a string, so we must make sure that the loop
801                    //   runs with startPos == testStartLimit the last time through.
802		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
803                        return FALSE;
804                }
805            } else {
806                for (;;) {
807                    if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
808                        ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
809                            if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
810                                (void)UTEXT_NEXT32(fInputText);
811                                startPos = UTEXT_GETNATIVEINDEX(fInputText);
812                            }
813                            MatchAt(startPos, FALSE, fDeferredStatus);
814                            if (U_FAILURE(fDeferredStatus)) {
815                                return FALSE;
816                            }
817                            if (fMatch) {
818                                return TRUE;
819                            }
820                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
821                    }
822                    if (startPos >= testStartLimit) {
823                        fMatch = FALSE;
824                        fHitEnd = TRUE;
825                        return FALSE;
826                    }
827                    c = UTEXT_NEXT32(fInputText);
828                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
829                    // Note that it's perfectly OK for a pattern to have a zero-length
830                    //   match at the end of a string, so we must make sure that the loop
831                    //   runs with startPos == testStartLimit the last time through.
832		            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
833                        return FALSE;
834                }
835            }
836        }
837
838    default:
839        U_ASSERT(FALSE);
840    }
841
842    U_ASSERT(FALSE);
843    return FALSE;
844}
845
846
847
848UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
849    if (U_FAILURE(status)) {
850        return FALSE;
851    }
852    if (U_FAILURE(fDeferredStatus)) {
853        status = fDeferredStatus;
854        return FALSE;
855    }
856    this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
857                                          //        This will reset the region to be the full input length.
858    if (start < 0) {
859        status = U_INDEX_OUTOFBOUNDS_ERROR;
860        return FALSE;
861    }
862
863    int64_t nativeStart = start;
864    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
865        status = U_INDEX_OUTOFBOUNDS_ERROR;
866        return FALSE;
867    }
868    fMatchEnd = nativeStart;
869    return find();
870}
871
872
873//--------------------------------------------------------------------------------
874//
875//   findUsingChunk() -- like find(), but with the advance knowledge that the
876//                       entire string is available in the UText's chunk buffer.
877//
878//--------------------------------------------------------------------------------
879UBool RegexMatcher::findUsingChunk() {
880    // Start at the position of the last match end.  (Will be zero if the
881    //   matcher has been reset.
882    //
883
884    int32_t startPos = (int32_t)fMatchEnd;
885    if (startPos==0) {
886        startPos = (int32_t)fActiveStart;
887    }
888
889    const UChar *inputBuf = fInputText->chunkContents;
890
891    if (fMatch) {
892        // Save the position of any previous successful match.
893        fLastMatchEnd = fMatchEnd;
894
895        if (fMatchStart == fMatchEnd) {
896            // Previous match had zero length.  Move start position up one position
897            //  to avoid sending find() into a loop on zero-length matches.
898            if (startPos >= fActiveLimit) {
899                fMatch = FALSE;
900                fHitEnd = TRUE;
901                return FALSE;
902            }
903            U16_FWD_1(inputBuf, startPos, fInputLength);
904        }
905    } else {
906        if (fLastMatchEnd >= 0) {
907            // A previous find() failed to match.  Don't try again.
908            //   (without this test, a pattern with a zero-length match
909            //    could match again at the end of an input string.)
910            fHitEnd = TRUE;
911            return FALSE;
912        }
913    }
914
915
916    // Compute the position in the input string beyond which a match can not begin, because
917    //   the minimum length match would extend past the end of the input.
918    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
919    //          Be aware of possible overflows if making changes here.
920    int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
921    if (startPos > testLen) {
922        fMatch = FALSE;
923        fHitEnd = TRUE;
924        return FALSE;
925    }
926
927    UChar32  c;
928    U_ASSERT(startPos >= 0);
929
930    switch (fPattern->fStartType) {
931    case START_NO_INFO:
932        // No optimization was found.
933        //  Try a match at each input position.
934        for (;;) {
935            MatchChunkAt(startPos, FALSE, fDeferredStatus);
936            if (U_FAILURE(fDeferredStatus)) {
937                return FALSE;
938            }
939            if (fMatch) {
940                return TRUE;
941            }
942            if (startPos >= testLen) {
943                fHitEnd = TRUE;
944                return FALSE;
945            }
946            U16_FWD_1(inputBuf, startPos, fActiveLimit);
947            // Note that it's perfectly OK for a pattern to have a zero-length
948            //   match at the end of a string, so we must make sure that the loop
949            //   runs with startPos == testLen the last time through.
950            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
951                return FALSE;
952        }
953        U_ASSERT(FALSE);
954
955    case START_START:
956        // Matches are only possible at the start of the input string
957        //   (pattern begins with ^ or \A)
958        if (startPos > fActiveStart) {
959            fMatch = FALSE;
960            return FALSE;
961        }
962        MatchChunkAt(startPos, FALSE, fDeferredStatus);
963        if (U_FAILURE(fDeferredStatus)) {
964            return FALSE;
965        }
966        return fMatch;
967
968
969    case START_SET:
970    {
971        // Match may start on any char from a pre-computed set.
972        U_ASSERT(fPattern->fMinMatchLen > 0);
973        for (;;) {
974            int32_t pos = startPos;
975            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
976            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
977                (c>=256 && fPattern->fInitialChars->contains(c))) {
978                MatchChunkAt(pos, FALSE, fDeferredStatus);
979                if (U_FAILURE(fDeferredStatus)) {
980                    return FALSE;
981                }
982                if (fMatch) {
983                    return TRUE;
984                }
985            }
986            if (pos >= testLen) {
987                fMatch = FALSE;
988                fHitEnd = TRUE;
989                return FALSE;
990            }
991            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
992                return FALSE;
993        }
994    }
995        U_ASSERT(FALSE);
996
997    case START_STRING:
998    case START_CHAR:
999    {
1000        // Match starts on exactly one char.
1001        U_ASSERT(fPattern->fMinMatchLen > 0);
1002        UChar32 theChar = fPattern->fInitialChar;
1003        for (;;) {
1004            int32_t pos = startPos;
1005            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1006            if (c == theChar) {
1007                MatchChunkAt(pos, FALSE, fDeferredStatus);
1008                if (U_FAILURE(fDeferredStatus)) {
1009                    return FALSE;
1010                }
1011                if (fMatch) {
1012                    return TRUE;
1013                }
1014            }
1015            if (pos >= testLen) {
1016                fMatch = FALSE;
1017                fHitEnd = TRUE;
1018                return FALSE;
1019            }
1020            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1021                return FALSE;
1022        }
1023    }
1024        U_ASSERT(FALSE);
1025
1026    case START_LINE:
1027    {
1028        UChar32  c;
1029        if (startPos == fAnchorStart) {
1030            MatchChunkAt(startPos, FALSE, fDeferredStatus);
1031            if (U_FAILURE(fDeferredStatus)) {
1032                return FALSE;
1033            }
1034            if (fMatch) {
1035                return TRUE;
1036            }
1037            U16_FWD_1(inputBuf, startPos, fActiveLimit);
1038        }
1039
1040        if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1041            for (;;) {
1042                c = inputBuf[startPos-1];
1043                if (c == 0x0a) {
1044                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
1045                    if (U_FAILURE(fDeferredStatus)) {
1046                        return FALSE;
1047                    }
1048                    if (fMatch) {
1049                        return TRUE;
1050                    }
1051                }
1052                if (startPos >= testLen) {
1053                    fMatch = FALSE;
1054                    fHitEnd = TRUE;
1055                    return FALSE;
1056                }
1057                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1058                // Note that it's perfectly OK for a pattern to have a zero-length
1059                //   match at the end of a string, so we must make sure that the loop
1060                //   runs with startPos == testLen the last time through.
1061	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1062                    return FALSE;
1063            }
1064        } else {
1065            for (;;) {
1066                c = inputBuf[startPos-1];
1067                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
1068                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) {
1069                    if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1070                        startPos++;
1071                    }
1072                    MatchChunkAt(startPos, FALSE, fDeferredStatus);
1073                    if (U_FAILURE(fDeferredStatus)) {
1074                        return FALSE;
1075                    }
1076                    if (fMatch) {
1077                        return TRUE;
1078                    }
1079                }
1080                if (startPos >= testLen) {
1081                    fMatch = FALSE;
1082                    fHitEnd = TRUE;
1083                    return FALSE;
1084                }
1085                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1086                // Note that it's perfectly OK for a pattern to have a zero-length
1087                //   match at the end of a string, so we must make sure that the loop
1088                //   runs with startPos == testLen the last time through.
1089	            if  (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus))
1090                    return FALSE;
1091            }
1092        }
1093    }
1094
1095    default:
1096        U_ASSERT(FALSE);
1097    }
1098
1099    U_ASSERT(FALSE);
1100    return FALSE;
1101}
1102
1103
1104
1105//--------------------------------------------------------------------------------
1106//
1107//  group()
1108//
1109//--------------------------------------------------------------------------------
1110UnicodeString RegexMatcher::group(UErrorCode &status) const {
1111    return group(0, status);
1112}
1113
1114//  Return immutable shallow clone
1115UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1116    return group(0, dest, group_len, status);
1117}
1118
1119//  Return immutable shallow clone
1120UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1121    group_len = 0;
1122    if (U_FAILURE(status)) {
1123        return dest;
1124    }
1125    if (U_FAILURE(fDeferredStatus)) {
1126        status = fDeferredStatus;
1127    } else if (fMatch == FALSE) {
1128        status = U_REGEX_INVALID_STATE;
1129    } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1130        status = U_INDEX_OUTOFBOUNDS_ERROR;
1131    }
1132
1133    if (U_FAILURE(status)) {
1134        return dest;
1135    }
1136
1137    int64_t s, e;
1138    if (groupNum == 0) {
1139        s = fMatchStart;
1140        e = fMatchEnd;
1141    } else {
1142        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1143        U_ASSERT(groupOffset < fPattern->fFrameSize);
1144        U_ASSERT(groupOffset >= 0);
1145        s = fFrame->fExtra[groupOffset];
1146        e = fFrame->fExtra[groupOffset+1];
1147    }
1148
1149    if (s < 0) {
1150        // A capture group wasn't part of the match
1151        return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1152    }
1153    U_ASSERT(s <= e);
1154    group_len = e - s;
1155
1156    dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1157    if (dest)
1158        UTEXT_SETNATIVEINDEX(dest, s);
1159    return dest;
1160}
1161
1162UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1163    UnicodeString result;
1164    if (U_FAILURE(status)) {
1165        return result;
1166    }
1167    UText resultText = UTEXT_INITIALIZER;
1168    utext_openUnicodeString(&resultText, &result, &status);
1169    group(groupNum, &resultText, status);
1170    utext_close(&resultText);
1171    return result;
1172}
1173
1174
1175//  Return deep (mutable) clone
1176//		Technology Preview (as an API), but note that the UnicodeString API is implemented
1177//		using this function.
1178UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const {
1179    if (U_FAILURE(status)) {
1180        return dest;
1181    }
1182
1183    if (U_FAILURE(fDeferredStatus)) {
1184        status = fDeferredStatus;
1185    } else if (fMatch == FALSE) {
1186        status = U_REGEX_INVALID_STATE;
1187    } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1188        status = U_INDEX_OUTOFBOUNDS_ERROR;
1189    }
1190    if (U_FAILURE(status)) {
1191        return dest;
1192    }
1193
1194    int64_t s, e;
1195    if (groupNum == 0) {
1196        s = fMatchStart;
1197        e = fMatchEnd;
1198    } else {
1199        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1200        U_ASSERT(groupOffset < fPattern->fFrameSize);
1201        U_ASSERT(groupOffset >= 0);
1202        s = fFrame->fExtra[groupOffset];
1203        e = fFrame->fExtra[groupOffset+1];
1204    }
1205
1206    if (s < 0) {
1207        // A capture group wasn't part of the match
1208        if (dest) {
1209            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status);
1210            return dest;
1211        } else {
1212            return utext_openUChars(NULL, NULL, 0, &status);
1213        }
1214    }
1215    U_ASSERT(s <= e);
1216
1217    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1218        U_ASSERT(e <= fInputLength);
1219        if (dest) {
1220            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status);
1221        } else {
1222            UText groupText = UTEXT_INITIALIZER;
1223            utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status);
1224            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1225            utext_close(&groupText);
1226        }
1227    } else {
1228        int32_t len16;
1229        if (UTEXT_USES_U16(fInputText)) {
1230            len16 = (int32_t)(e-s);
1231        } else {
1232            UErrorCode lengthStatus = U_ZERO_ERROR;
1233            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1234        }
1235        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1236        if (groupChars == NULL) {
1237            status = U_MEMORY_ALLOCATION_ERROR;
1238            return dest;
1239        }
1240        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1241
1242        if (dest) {
1243            utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status);
1244        } else {
1245            UText groupText = UTEXT_INITIALIZER;
1246            utext_openUChars(&groupText, groupChars, len16, &status);
1247            dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status);
1248            utext_close(&groupText);
1249        }
1250
1251        uprv_free(groupChars);
1252    }
1253    return dest;
1254}
1255
1256//--------------------------------------------------------------------------------
1257//
1258//  appendGroup() -- currently internal only, appends a group to a UText rather
1259//                   than replacing its contents
1260//
1261//--------------------------------------------------------------------------------
1262
1263int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1264    if (U_FAILURE(status)) {
1265        return 0;
1266    }
1267    if (U_FAILURE(fDeferredStatus)) {
1268        status = fDeferredStatus;
1269        return 0;
1270    }
1271    int64_t destLen = utext_nativeLength(dest);
1272
1273    if (fMatch == FALSE) {
1274        status = U_REGEX_INVALID_STATE;
1275        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1276    }
1277    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1278        status = U_INDEX_OUTOFBOUNDS_ERROR;
1279        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1280    }
1281
1282    int64_t s, e;
1283    if (groupNum == 0) {
1284        s = fMatchStart;
1285        e = fMatchEnd;
1286    } else {
1287        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1288        U_ASSERT(groupOffset < fPattern->fFrameSize);
1289        U_ASSERT(groupOffset >= 0);
1290        s = fFrame->fExtra[groupOffset];
1291        e = fFrame->fExtra[groupOffset+1];
1292    }
1293
1294    if (s < 0) {
1295        // A capture group wasn't part of the match
1296        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1297    }
1298    U_ASSERT(s <= e);
1299
1300    int64_t deltaLen;
1301    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1302        U_ASSERT(e <= fInputLength);
1303        deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1304    } else {
1305        int32_t len16;
1306        if (UTEXT_USES_U16(fInputText)) {
1307            len16 = (int32_t)(e-s);
1308        } else {
1309            UErrorCode lengthStatus = U_ZERO_ERROR;
1310            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1311        }
1312        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1313        if (groupChars == NULL) {
1314            status = U_MEMORY_ALLOCATION_ERROR;
1315            return 0;
1316        }
1317        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1318
1319        deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1320        uprv_free(groupChars);
1321    }
1322    return deltaLen;
1323}
1324
1325
1326
1327//--------------------------------------------------------------------------------
1328//
1329//  groupCount()
1330//
1331//--------------------------------------------------------------------------------
1332int32_t RegexMatcher::groupCount() const {
1333    return fPattern->fGroupMap->size();
1334}
1335
1336
1337
1338//--------------------------------------------------------------------------------
1339//
1340//  hasAnchoringBounds()
1341//
1342//--------------------------------------------------------------------------------
1343UBool RegexMatcher::hasAnchoringBounds() const {
1344    return fAnchoringBounds;
1345}
1346
1347
1348//--------------------------------------------------------------------------------
1349//
1350//  hasTransparentBounds()
1351//
1352//--------------------------------------------------------------------------------
1353UBool RegexMatcher::hasTransparentBounds() const {
1354    return fTransparentBounds;
1355}
1356
1357
1358
1359//--------------------------------------------------------------------------------
1360//
1361//  hitEnd()
1362//
1363//--------------------------------------------------------------------------------
1364UBool RegexMatcher::hitEnd() const {
1365    return fHitEnd;
1366}
1367
1368
1369//--------------------------------------------------------------------------------
1370//
1371//  input()
1372//
1373//--------------------------------------------------------------------------------
1374const UnicodeString &RegexMatcher::input() const {
1375    if (!fInput) {
1376        UErrorCode status = U_ZERO_ERROR;
1377        int32_t len16;
1378        if (UTEXT_USES_U16(fInputText)) {
1379            len16 = (int32_t)fInputLength;
1380        } else {
1381            len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1382            status = U_ZERO_ERROR; // overflow, length status
1383        }
1384        UnicodeString *result = new UnicodeString(len16, 0, 0);
1385
1386        UChar *inputChars = result->getBuffer(len16);
1387        utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1388        result->releaseBuffer(len16);
1389
1390        (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1391    }
1392
1393    return *fInput;
1394}
1395
1396//--------------------------------------------------------------------------------
1397//
1398//  inputText()
1399//
1400//--------------------------------------------------------------------------------
1401UText *RegexMatcher::inputText() const {
1402    return fInputText;
1403}
1404
1405
1406//--------------------------------------------------------------------------------
1407//
1408//  getInput() -- like inputText(), but makes a clone or copies into another UText
1409//
1410//--------------------------------------------------------------------------------
1411UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1412    if (U_FAILURE(status)) {
1413        return dest;
1414    }
1415    if (U_FAILURE(fDeferredStatus)) {
1416        status = fDeferredStatus;
1417        return dest;
1418    }
1419
1420    if (dest) {
1421        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1422            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1423        } else {
1424            int32_t input16Len;
1425            if (UTEXT_USES_U16(fInputText)) {
1426                input16Len = (int32_t)fInputLength;
1427            } else {
1428                UErrorCode lengthStatus = U_ZERO_ERROR;
1429                input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1430            }
1431            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1432            if (inputChars == NULL) {
1433                return dest;
1434            }
1435
1436            status = U_ZERO_ERROR;
1437            utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1438            status = U_ZERO_ERROR;
1439            utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1440
1441            uprv_free(inputChars);
1442        }
1443        return dest;
1444    } else {
1445        return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1446    }
1447}
1448
1449
1450static UBool compat_SyncMutableUTextContents(UText *ut);
1451static UBool compat_SyncMutableUTextContents(UText *ut) {
1452    UBool retVal = FALSE;
1453
1454    //  In the following test, we're really only interested in whether the UText should switch
1455    //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1456    //  will still point to the correct data.
1457    if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1458        UnicodeString *us=(UnicodeString *)ut->context;
1459
1460        // Update to the latest length.
1461        // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1462        int32_t newLength = us->length();
1463
1464        // Update the chunk description.
1465        // The buffer may have switched between stack- and heap-based.
1466        ut->chunkContents    = us->getBuffer();
1467        ut->chunkLength      = newLength;
1468        ut->chunkNativeLimit = newLength;
1469        ut->nativeIndexingLimit = newLength;
1470        retVal = TRUE;
1471    }
1472
1473    return retVal;
1474}
1475
1476//--------------------------------------------------------------------------------
1477//
1478//  lookingAt()
1479//
1480//--------------------------------------------------------------------------------
1481UBool RegexMatcher::lookingAt(UErrorCode &status) {
1482    if (U_FAILURE(status)) {
1483        return FALSE;
1484    }
1485    if (U_FAILURE(fDeferredStatus)) {
1486        status = fDeferredStatus;
1487        return FALSE;
1488    }
1489
1490    if (fInputUniStrMaybeMutable) {
1491        if (compat_SyncMutableUTextContents(fInputText)) {
1492        fInputLength = utext_nativeLength(fInputText);
1493        reset();
1494        }
1495    }
1496    else {
1497        resetPreserveRegion();
1498    }
1499    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1500        MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1501    } else {
1502        MatchAt(fActiveStart, FALSE, status);
1503    }
1504    return fMatch;
1505}
1506
1507
1508UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1509    if (U_FAILURE(status)) {
1510        return FALSE;
1511    }
1512    if (U_FAILURE(fDeferredStatus)) {
1513        status = fDeferredStatus;
1514        return FALSE;
1515    }
1516    reset();
1517
1518    if (start < 0) {
1519        status = U_INDEX_OUTOFBOUNDS_ERROR;
1520        return FALSE;
1521    }
1522
1523    if (fInputUniStrMaybeMutable) {
1524        if (compat_SyncMutableUTextContents(fInputText)) {
1525        fInputLength = utext_nativeLength(fInputText);
1526        reset();
1527        }
1528    }
1529
1530    int64_t nativeStart;
1531    nativeStart = start;
1532    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1533        status = U_INDEX_OUTOFBOUNDS_ERROR;
1534        return FALSE;
1535    }
1536
1537    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1538        MatchChunkAt((int32_t)nativeStart, FALSE, status);
1539    } else {
1540        MatchAt(nativeStart, FALSE, status);
1541    }
1542    return fMatch;
1543}
1544
1545
1546
1547//--------------------------------------------------------------------------------
1548//
1549//  matches()
1550//
1551//--------------------------------------------------------------------------------
1552UBool RegexMatcher::matches(UErrorCode &status) {
1553    if (U_FAILURE(status)) {
1554        return FALSE;
1555    }
1556    if (U_FAILURE(fDeferredStatus)) {
1557        status = fDeferredStatus;
1558        return FALSE;
1559    }
1560
1561    if (fInputUniStrMaybeMutable) {
1562        if (compat_SyncMutableUTextContents(fInputText)) {
1563        fInputLength = utext_nativeLength(fInputText);
1564        reset();
1565        }
1566    }
1567    else {
1568        resetPreserveRegion();
1569    }
1570
1571    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1572        MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1573    } else {
1574        MatchAt(fActiveStart, TRUE, status);
1575    }
1576    return fMatch;
1577}
1578
1579
1580UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1581    if (U_FAILURE(status)) {
1582        return FALSE;
1583    }
1584    if (U_FAILURE(fDeferredStatus)) {
1585        status = fDeferredStatus;
1586        return FALSE;
1587    }
1588    reset();
1589
1590    if (start < 0) {
1591        status = U_INDEX_OUTOFBOUNDS_ERROR;
1592        return FALSE;
1593    }
1594
1595    if (fInputUniStrMaybeMutable) {
1596        if (compat_SyncMutableUTextContents(fInputText)) {
1597        fInputLength = utext_nativeLength(fInputText);
1598        reset();
1599        }
1600    }
1601
1602    int64_t nativeStart;
1603    nativeStart = start;
1604    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1605        status = U_INDEX_OUTOFBOUNDS_ERROR;
1606        return FALSE;
1607    }
1608
1609    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1610        MatchChunkAt((int32_t)nativeStart, TRUE, status);
1611    } else {
1612        MatchAt(nativeStart, TRUE, status);
1613    }
1614    return fMatch;
1615}
1616
1617
1618
1619//--------------------------------------------------------------------------------
1620//
1621//    pattern
1622//
1623//--------------------------------------------------------------------------------
1624const RegexPattern &RegexMatcher::pattern() const {
1625    return *fPattern;
1626}
1627
1628
1629
1630//--------------------------------------------------------------------------------
1631//
1632//    region
1633//
1634//--------------------------------------------------------------------------------
1635RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1636    if (U_FAILURE(status)) {
1637        return *this;
1638    }
1639
1640    if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1641        status = U_ILLEGAL_ARGUMENT_ERROR;
1642    }
1643
1644    int64_t nativeStart = regionStart;
1645    int64_t nativeLimit = regionLimit;
1646    if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1647      status = U_ILLEGAL_ARGUMENT_ERROR;
1648    }
1649
1650    if (startIndex == -1)
1651      this->reset();
1652    else
1653      resetPreserveRegion();
1654
1655    fRegionStart = nativeStart;
1656    fRegionLimit = nativeLimit;
1657    fActiveStart = nativeStart;
1658    fActiveLimit = nativeLimit;
1659
1660    if (startIndex != -1) {
1661      if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1662          status = U_INDEX_OUTOFBOUNDS_ERROR;
1663      }
1664      fMatchEnd = startIndex;
1665    }
1666
1667    if (!fTransparentBounds) {
1668        fLookStart = nativeStart;
1669        fLookLimit = nativeLimit;
1670    }
1671    if (fAnchoringBounds) {
1672        fAnchorStart = nativeStart;
1673        fAnchorLimit = nativeLimit;
1674    }
1675    return *this;
1676}
1677
1678RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1679  return region(start, limit, -1, status);
1680}
1681
1682//--------------------------------------------------------------------------------
1683//
1684//    regionEnd
1685//
1686//--------------------------------------------------------------------------------
1687int32_t RegexMatcher::regionEnd() const {
1688    return (int32_t)fRegionLimit;
1689}
1690
1691int64_t RegexMatcher::regionEnd64() const {
1692    return fRegionLimit;
1693}
1694
1695//--------------------------------------------------------------------------------
1696//
1697//    regionStart
1698//
1699//--------------------------------------------------------------------------------
1700int32_t RegexMatcher::regionStart() const {
1701    return (int32_t)fRegionStart;
1702}
1703
1704int64_t RegexMatcher::regionStart64() const {
1705    return fRegionStart;
1706}
1707
1708
1709//--------------------------------------------------------------------------------
1710//
1711//    replaceAll
1712//
1713//--------------------------------------------------------------------------------
1714UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1715    UText replacementText = UTEXT_INITIALIZER;
1716    UText resultText = UTEXT_INITIALIZER;
1717    UnicodeString resultString;
1718    if (U_FAILURE(status)) {
1719        return resultString;
1720    }
1721
1722    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1723    utext_openUnicodeString(&resultText, &resultString, &status);
1724
1725    replaceAll(&replacementText, &resultText, status);
1726
1727    utext_close(&resultText);
1728    utext_close(&replacementText);
1729
1730    return resultString;
1731}
1732
1733
1734//
1735//    replaceAll, UText mode
1736//
1737UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1738    if (U_FAILURE(status)) {
1739        return dest;
1740    }
1741    if (U_FAILURE(fDeferredStatus)) {
1742        status = fDeferredStatus;
1743        return dest;
1744    }
1745
1746    if (dest == NULL) {
1747        UnicodeString emptyString;
1748        UText empty = UTEXT_INITIALIZER;
1749
1750        utext_openUnicodeString(&empty, &emptyString, &status);
1751        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1752        utext_close(&empty);
1753    }
1754
1755    if (U_SUCCESS(status)) {
1756        reset();
1757        while (find()) {
1758            appendReplacement(dest, replacement, status);
1759            if (U_FAILURE(status)) {
1760                break;
1761            }
1762        }
1763        appendTail(dest, status);
1764    }
1765
1766    return dest;
1767}
1768
1769
1770//--------------------------------------------------------------------------------
1771//
1772//    replaceFirst
1773//
1774//--------------------------------------------------------------------------------
1775UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1776    UText replacementText = UTEXT_INITIALIZER;
1777    UText resultText = UTEXT_INITIALIZER;
1778    UnicodeString resultString;
1779
1780    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1781    utext_openUnicodeString(&resultText, &resultString, &status);
1782
1783    replaceFirst(&replacementText, &resultText, status);
1784
1785    utext_close(&resultText);
1786    utext_close(&replacementText);
1787
1788    return resultString;
1789}
1790
1791//
1792//    replaceFirst, UText mode
1793//
1794UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1795    if (U_FAILURE(status)) {
1796        return dest;
1797    }
1798    if (U_FAILURE(fDeferredStatus)) {
1799        status = fDeferredStatus;
1800        return dest;
1801    }
1802
1803    reset();
1804    if (!find()) {
1805        return getInput(dest, status);
1806    }
1807
1808    if (dest == NULL) {
1809        UnicodeString emptyString;
1810        UText empty = UTEXT_INITIALIZER;
1811
1812        utext_openUnicodeString(&empty, &emptyString, &status);
1813        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1814        utext_close(&empty);
1815    }
1816
1817    appendReplacement(dest, replacement, status);
1818    appendTail(dest, status);
1819
1820    return dest;
1821}
1822
1823
1824//--------------------------------------------------------------------------------
1825//
1826//     requireEnd
1827//
1828//--------------------------------------------------------------------------------
1829UBool RegexMatcher::requireEnd() const {
1830    return fRequireEnd;
1831}
1832
1833
1834//--------------------------------------------------------------------------------
1835//
1836//     reset
1837//
1838//--------------------------------------------------------------------------------
1839RegexMatcher &RegexMatcher::reset() {
1840    fRegionStart    = 0;
1841    fRegionLimit    = fInputLength;
1842    fActiveStart    = 0;
1843    fActiveLimit    = fInputLength;
1844    fAnchorStart    = 0;
1845    fAnchorLimit    = fInputLength;
1846    fLookStart      = 0;
1847    fLookLimit      = fInputLength;
1848    resetPreserveRegion();
1849    return *this;
1850}
1851
1852
1853
1854void RegexMatcher::resetPreserveRegion() {
1855    fMatchStart     = 0;
1856    fMatchEnd       = 0;
1857    fLastMatchEnd   = -1;
1858    fAppendPosition = 0;
1859    fMatch          = FALSE;
1860    fHitEnd         = FALSE;
1861    fRequireEnd     = FALSE;
1862    fTime           = 0;
1863    fTickCounter    = TIMER_INITIAL_VALUE;
1864    //resetStack(); // more expensive than it looks...
1865}
1866
1867
1868RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1869    fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1870    if (fPattern->fNeedsAltInput) {
1871        fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1872    }
1873    fInputLength = utext_nativeLength(fInputText);
1874
1875    reset();
1876    delete fInput;
1877    fInput = NULL;
1878
1879    //  Do the following for any UnicodeString.
1880    //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1881    fInputUniStrMaybeMutable = TRUE;
1882
1883    if (fWordBreakItr != NULL) {
1884#if UCONFIG_NO_BREAK_ITERATION==0
1885        UErrorCode status = U_ZERO_ERROR;
1886        fWordBreakItr->setText(fInputText, status);
1887#endif
1888    }
1889    return *this;
1890}
1891
1892
1893RegexMatcher &RegexMatcher::reset(UText *input) {
1894    if (fInputText != input) {
1895        fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1896        if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1897        fInputLength = utext_nativeLength(fInputText);
1898
1899        delete fInput;
1900        fInput = NULL;
1901
1902        if (fWordBreakItr != NULL) {
1903#if UCONFIG_NO_BREAK_ITERATION==0
1904            UErrorCode status = U_ZERO_ERROR;
1905            fWordBreakItr->setText(input, status);
1906#endif
1907        }
1908    }
1909    reset();
1910    fInputUniStrMaybeMutable = FALSE;
1911
1912    return *this;
1913}
1914
1915/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1916    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1917    return *this;
1918}*/
1919
1920RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1921    if (U_FAILURE(status)) {
1922        return *this;
1923    }
1924    reset();       // Reset also resets the region to be the entire string.
1925
1926    if (position < 0 || position > fActiveLimit) {
1927        status = U_INDEX_OUTOFBOUNDS_ERROR;
1928        return *this;
1929    }
1930    fMatchEnd = position;
1931    return *this;
1932}
1933
1934
1935//--------------------------------------------------------------------------------
1936//
1937//    refresh
1938//
1939//--------------------------------------------------------------------------------
1940RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1941    if (U_FAILURE(status)) {
1942        return *this;
1943    }
1944    if (input == NULL) {
1945        status = U_ILLEGAL_ARGUMENT_ERROR;
1946        return *this;
1947    }
1948    if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1949        status = U_ILLEGAL_ARGUMENT_ERROR;
1950        return *this;
1951    }
1952    int64_t  pos = utext_getNativeIndex(fInputText);
1953    //  Shallow read-only clone of the new UText into the existing input UText
1954    fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1955    if (U_FAILURE(status)) {
1956        return *this;
1957    }
1958    utext_setNativeIndex(fInputText, pos);
1959
1960    if (fAltInputText != NULL) {
1961        pos = utext_getNativeIndex(fAltInputText);
1962        fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1963        if (U_FAILURE(status)) {
1964            return *this;
1965        }
1966        utext_setNativeIndex(fAltInputText, pos);
1967    }
1968    return *this;
1969}
1970
1971
1972
1973//--------------------------------------------------------------------------------
1974//
1975//    setTrace
1976//
1977//--------------------------------------------------------------------------------
1978void RegexMatcher::setTrace(UBool state) {
1979    fTraceDebug = state;
1980}
1981
1982
1983
1984//---------------------------------------------------------------------
1985//
1986//   split
1987//
1988//---------------------------------------------------------------------
1989int32_t  RegexMatcher::split(const UnicodeString &input,
1990        UnicodeString    dest[],
1991        int32_t          destCapacity,
1992        UErrorCode      &status)
1993{
1994    UText inputText = UTEXT_INITIALIZER;
1995    utext_openConstUnicodeString(&inputText, &input, &status);
1996    if (U_FAILURE(status)) {
1997        return 0;
1998    }
1999
2000    UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2001    if (destText == NULL) {
2002        status = U_MEMORY_ALLOCATION_ERROR;
2003        return 0;
2004    }
2005    int32_t i;
2006    for (i = 0; i < destCapacity; i++) {
2007        destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2008    }
2009
2010    int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2011
2012    for (i = 0; i < destCapacity; i++) {
2013        utext_close(destText[i]);
2014    }
2015
2016    uprv_free(destText);
2017    utext_close(&inputText);
2018    return fieldCount;
2019}
2020
2021//
2022//   split, UText mode
2023//
2024int32_t  RegexMatcher::split(UText *input,
2025        UText           *dest[],
2026        int32_t          destCapacity,
2027        UErrorCode      &status)
2028{
2029    //
2030    // Check arguements for validity
2031    //
2032    if (U_FAILURE(status)) {
2033        return 0;
2034    };
2035
2036    if (destCapacity < 1) {
2037        status = U_ILLEGAL_ARGUMENT_ERROR;
2038        return 0;
2039    }
2040
2041    //
2042    // Reset for the input text
2043    //
2044    reset(input);
2045    int64_t   nextOutputStringStart = 0;
2046    if (fActiveLimit == 0) {
2047        return 0;
2048    }
2049
2050    //
2051    // Loop through the input text, searching for the delimiter pattern
2052    //
2053    int32_t i;
2054    int32_t numCaptureGroups = fPattern->fGroupMap->size();
2055    for (i=0; ; i++) {
2056        if (i>=destCapacity-1) {
2057            // There is one or zero output string left.
2058            // Fill the last output string with whatever is left from the input, then exit the loop.
2059            //  ( i will be == destCapacity if we filled the output array while processing
2060            //    capture groups of the delimiter expression, in which case we will discard the
2061            //    last capture group saved in favor of the unprocessed remainder of the
2062            //    input string.)
2063            i = destCapacity-1;
2064            if (fActiveLimit > nextOutputStringStart) {
2065                if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2066                    if (dest[i]) {
2067                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2068                                      input->chunkContents+nextOutputStringStart,
2069                                      (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2070                    } else {
2071                        UText remainingText = UTEXT_INITIALIZER;
2072                        utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2073                                         fActiveLimit-nextOutputStringStart, &status);
2074                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2075                        utext_close(&remainingText);
2076                    }
2077                } else {
2078                    UErrorCode lengthStatus = U_ZERO_ERROR;
2079                    int32_t remaining16Length =
2080                        utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2081                    UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2082                    if (remainingChars == NULL) {
2083                        status = U_MEMORY_ALLOCATION_ERROR;
2084                        break;
2085                    }
2086
2087                    utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2088                    if (dest[i]) {
2089                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2090                    } else {
2091                        UText remainingText = UTEXT_INITIALIZER;
2092                        utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2093                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2094                        utext_close(&remainingText);
2095                    }
2096
2097                    uprv_free(remainingChars);
2098                }
2099            }
2100            break;
2101        }
2102        if (find()) {
2103            // We found another delimiter.  Move everything from where we started looking
2104            //  up until the start of the delimiter into the next output string.
2105            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2106                if (dest[i]) {
2107                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2108                                  input->chunkContents+nextOutputStringStart,
2109                                  (int32_t)(fMatchStart-nextOutputStringStart), &status);
2110                } else {
2111                    UText remainingText = UTEXT_INITIALIZER;
2112                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2113                                      fMatchStart-nextOutputStringStart, &status);
2114                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2115                    utext_close(&remainingText);
2116                }
2117            } else {
2118                UErrorCode lengthStatus = U_ZERO_ERROR;
2119                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2120                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2121                if (remainingChars == NULL) {
2122                    status = U_MEMORY_ALLOCATION_ERROR;
2123                    break;
2124                }
2125                utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2126                if (dest[i]) {
2127                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2128                } else {
2129                    UText remainingText = UTEXT_INITIALIZER;
2130                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2131                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2132                    utext_close(&remainingText);
2133                }
2134
2135                uprv_free(remainingChars);
2136            }
2137            nextOutputStringStart = fMatchEnd;
2138
2139            // If the delimiter pattern has capturing parentheses, the captured
2140            //  text goes out into the next n destination strings.
2141            int32_t groupNum;
2142            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2143                if (i >= destCapacity-2) {
2144                    // Never fill the last available output string with capture group text.
2145                    // It will filled with the last field, the remainder of the
2146                    //  unsplit input text.
2147                    break;
2148                }
2149                i++;
2150                dest[i] = group(groupNum, dest[i], status);
2151            }
2152
2153            if (nextOutputStringStart == fActiveLimit) {
2154                // The delimiter was at the end of the string.  We're done, but first
2155                // we output one last empty string, for the empty field following
2156                //   the delimiter at the end of input.
2157                if (i+1 < destCapacity) {
2158                    ++i;
2159                    if (dest[i] == NULL) {
2160                        dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2161                    } else {
2162                        static UChar emptyString[] = {(UChar)0};
2163                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2164                    }
2165                }
2166                break;
2167
2168            }
2169        }
2170        else
2171        {
2172            // We ran off the end of the input while looking for the next delimiter.
2173            // All the remaining text goes into the current output string.
2174            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2175                if (dest[i]) {
2176                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2177                                  input->chunkContents+nextOutputStringStart,
2178                                  (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2179                } else {
2180                    UText remainingText = UTEXT_INITIALIZER;
2181                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2182                                     fActiveLimit-nextOutputStringStart, &status);
2183                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2184                    utext_close(&remainingText);
2185                }
2186            } else {
2187                UErrorCode lengthStatus = U_ZERO_ERROR;
2188                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2189                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2190                if (remainingChars == NULL) {
2191                    status = U_MEMORY_ALLOCATION_ERROR;
2192                    break;
2193                }
2194
2195                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2196                if (dest[i]) {
2197                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2198                } else {
2199                    UText remainingText = UTEXT_INITIALIZER;
2200                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2201                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2202                    utext_close(&remainingText);
2203                }
2204
2205                uprv_free(remainingChars);
2206            }
2207            break;
2208        }
2209        if (U_FAILURE(status)) {
2210            break;
2211        }
2212    }   // end of for loop
2213    return i+1;
2214}
2215
2216
2217//--------------------------------------------------------------------------------
2218//
2219//     start
2220//
2221//--------------------------------------------------------------------------------
2222int32_t RegexMatcher::start(UErrorCode &status) const {
2223    return start(0, status);
2224}
2225
2226int64_t RegexMatcher::start64(UErrorCode &status) const {
2227    return start64(0, status);
2228}
2229
2230//--------------------------------------------------------------------------------
2231//
2232//     start(int32_t group, UErrorCode &status)
2233//
2234//--------------------------------------------------------------------------------
2235
2236int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2237    if (U_FAILURE(status)) {
2238        return -1;
2239    }
2240    if (U_FAILURE(fDeferredStatus)) {
2241        status = fDeferredStatus;
2242        return -1;
2243    }
2244    if (fMatch == FALSE) {
2245        status = U_REGEX_INVALID_STATE;
2246        return -1;
2247    }
2248    if (group < 0 || group > fPattern->fGroupMap->size()) {
2249        status = U_INDEX_OUTOFBOUNDS_ERROR;
2250        return -1;
2251    }
2252    int64_t s;
2253    if (group == 0) {
2254        s = fMatchStart;
2255    } else {
2256        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2257        U_ASSERT(groupOffset < fPattern->fFrameSize);
2258        U_ASSERT(groupOffset >= 0);
2259        s = fFrame->fExtra[groupOffset];
2260    }
2261
2262    return s;
2263}
2264
2265
2266int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2267    return (int32_t)start64(group, status);
2268}
2269
2270//--------------------------------------------------------------------------------
2271//
2272//     useAnchoringBounds
2273//
2274//--------------------------------------------------------------------------------
2275RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2276    fAnchoringBounds = b;
2277    fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2278    fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2279    return *this;
2280}
2281
2282
2283//--------------------------------------------------------------------------------
2284//
2285//     useTransparentBounds
2286//
2287//--------------------------------------------------------------------------------
2288RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2289    fTransparentBounds = b;
2290    fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2291    fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2292    return *this;
2293}
2294
2295//--------------------------------------------------------------------------------
2296//
2297//     setTimeLimit
2298//
2299//--------------------------------------------------------------------------------
2300void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2301    if (U_FAILURE(status)) {
2302        return;
2303    }
2304    if (U_FAILURE(fDeferredStatus)) {
2305        status = fDeferredStatus;
2306        return;
2307    }
2308    if (limit < 0) {
2309        status = U_ILLEGAL_ARGUMENT_ERROR;
2310        return;
2311    }
2312    fTimeLimit = limit;
2313}
2314
2315
2316//--------------------------------------------------------------------------------
2317//
2318//     getTimeLimit
2319//
2320//--------------------------------------------------------------------------------
2321int32_t RegexMatcher::getTimeLimit() const {
2322    return fTimeLimit;
2323}
2324
2325
2326//--------------------------------------------------------------------------------
2327//
2328//     setStackLimit
2329//
2330//--------------------------------------------------------------------------------
2331void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2332    if (U_FAILURE(status)) {
2333        return;
2334    }
2335    if (U_FAILURE(fDeferredStatus)) {
2336        status = fDeferredStatus;
2337        return;
2338    }
2339    if (limit < 0) {
2340        status = U_ILLEGAL_ARGUMENT_ERROR;
2341        return;
2342    }
2343
2344    // Reset the matcher.  This is needed here in case there is a current match
2345    //    whose final stack frame (containing the match results, pointed to by fFrame)
2346    //    would be lost by resizing to a smaller stack size.
2347    reset();
2348
2349    if (limit == 0) {
2350        // Unlimited stack expansion
2351        fStack->setMaxCapacity(0);
2352    } else {
2353        // Change the units of the limit  from bytes to ints, and bump the size up
2354        //   to be big enough to hold at least one stack frame for the pattern,
2355        //   if it isn't there already.
2356        int32_t adjustedLimit = limit / sizeof(int32_t);
2357        if (adjustedLimit < fPattern->fFrameSize) {
2358            adjustedLimit = fPattern->fFrameSize;
2359        }
2360        fStack->setMaxCapacity(adjustedLimit);
2361    }
2362    fStackLimit = limit;
2363}
2364
2365
2366//--------------------------------------------------------------------------------
2367//
2368//     getStackLimit
2369//
2370//--------------------------------------------------------------------------------
2371int32_t RegexMatcher::getStackLimit() const {
2372    return fStackLimit;
2373}
2374
2375
2376//--------------------------------------------------------------------------------
2377//
2378//     setMatchCallback
2379//
2380//--------------------------------------------------------------------------------
2381void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2382                                    const void              *context,
2383                                    UErrorCode              &status) {
2384    if (U_FAILURE(status)) {
2385        return;
2386    }
2387    fCallbackFn = callback;
2388    fCallbackContext = context;
2389}
2390
2391
2392//--------------------------------------------------------------------------------
2393//
2394//     getMatchCallback
2395//
2396//--------------------------------------------------------------------------------
2397void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2398                                  const void              *&context,
2399                                  UErrorCode              &status) {
2400    if (U_FAILURE(status)) {
2401       return;
2402    }
2403    callback = fCallbackFn;
2404    context  = fCallbackContext;
2405}
2406
2407
2408//--------------------------------------------------------------------------------
2409//
2410//     setMatchCallback
2411//
2412//--------------------------------------------------------------------------------
2413void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2414                                                const void                      *context,
2415                                                UErrorCode                      &status) {
2416    if (U_FAILURE(status)) {
2417        return;
2418    }
2419    fFindProgressCallbackFn = callback;
2420    fFindProgressCallbackContext = context;
2421}
2422
2423
2424//--------------------------------------------------------------------------------
2425//
2426//     getMatchCallback
2427//
2428//--------------------------------------------------------------------------------
2429void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2430                                                const void                    *&context,
2431                                                UErrorCode                    &status) {
2432    if (U_FAILURE(status)) {
2433       return;
2434    }
2435    callback = fFindProgressCallbackFn;
2436    context  = fFindProgressCallbackContext;
2437}
2438
2439
2440//================================================================================
2441//
2442//    Code following this point in this file is the internal
2443//    Match Engine Implementation.
2444//
2445//================================================================================
2446
2447
2448//--------------------------------------------------------------------------------
2449//
2450//   resetStack
2451//           Discard any previous contents of the state save stack, and initialize a
2452//           new stack frame to all -1.  The -1s are needed for capture group limits,
2453//           where they indicate that a group has not yet matched anything.
2454//--------------------------------------------------------------------------------
2455REStackFrame *RegexMatcher::resetStack() {
2456    // Discard any previous contents of the state save stack, and initialize a
2457    //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2458    //  where they indicate that a group has not yet matched anything.
2459    fStack->removeAllElements();
2460
2461    REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2462    int32_t i;
2463    for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2464        iFrame->fExtra[i] = -1;
2465    }
2466    return iFrame;
2467}
2468
2469
2470
2471//--------------------------------------------------------------------------------
2472//
2473//   isWordBoundary
2474//                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2475//                     For us,
2476//                       If the current char is a combining mark,
2477//                          \b is FALSE.
2478//                       Else Scan backwards to the first non-combining char.
2479//                            We are at a boundary if the this char and the original chars are
2480//                               opposite in membership in \w set
2481//
2482//          parameters:   pos   - the current position in the input buffer
2483//
2484//              TODO:  double-check edge cases at region boundaries.
2485//
2486//--------------------------------------------------------------------------------
2487UBool RegexMatcher::isWordBoundary(int64_t pos) {
2488    UBool isBoundary = FALSE;
2489    UBool cIsWord    = FALSE;
2490
2491    if (pos >= fLookLimit) {
2492        fHitEnd = TRUE;
2493    } else {
2494        // Determine whether char c at current position is a member of the word set of chars.
2495        // If we're off the end of the string, behave as though we're not at a word char.
2496        UTEXT_SETNATIVEINDEX(fInputText, pos);
2497        UChar32  c = UTEXT_CURRENT32(fInputText);
2498        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2499            // Current char is a combining one.  Not a boundary.
2500            return FALSE;
2501        }
2502        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2503    }
2504
2505    // Back up until we come to a non-combining char, determine whether
2506    //  that char is a word char.
2507    UBool prevCIsWord = FALSE;
2508    for (;;) {
2509        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2510            break;
2511        }
2512        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2513        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2514              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2515            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2516            break;
2517        }
2518    }
2519    isBoundary = cIsWord ^ prevCIsWord;
2520    return isBoundary;
2521}
2522
2523UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2524    UBool isBoundary = FALSE;
2525    UBool cIsWord    = FALSE;
2526
2527    const UChar *inputBuf = fInputText->chunkContents;
2528
2529    if (pos >= fLookLimit) {
2530        fHitEnd = TRUE;
2531    } else {
2532        // Determine whether char c at current position is a member of the word set of chars.
2533        // If we're off the end of the string, behave as though we're not at a word char.
2534        UChar32 c;
2535        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2536        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2537            // Current char is a combining one.  Not a boundary.
2538            return FALSE;
2539        }
2540        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2541    }
2542
2543    // Back up until we come to a non-combining char, determine whether
2544    //  that char is a word char.
2545    UBool prevCIsWord = FALSE;
2546    for (;;) {
2547        if (pos <= fLookStart) {
2548            break;
2549        }
2550        UChar32 prevChar;
2551        U16_PREV(inputBuf, fLookStart, pos, prevChar);
2552        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2553              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2554            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2555            break;
2556        }
2557    }
2558    isBoundary = cIsWord ^ prevCIsWord;
2559    return isBoundary;
2560}
2561
2562//--------------------------------------------------------------------------------
2563//
2564//   isUWordBoundary
2565//
2566//         Test for a word boundary using RBBI word break.
2567//
2568//          parameters:   pos   - the current position in the input buffer
2569//
2570//--------------------------------------------------------------------------------
2571UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2572    UBool       returnVal = FALSE;
2573#if UCONFIG_NO_BREAK_ITERATION==0
2574
2575    // If we haven't yet created a break iterator for this matcher, do it now.
2576    if (fWordBreakItr == NULL) {
2577        fWordBreakItr =
2578            (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2579        if (U_FAILURE(fDeferredStatus)) {
2580            return FALSE;
2581        }
2582        fWordBreakItr->setText(fInputText, fDeferredStatus);
2583    }
2584
2585    if (pos >= fLookLimit) {
2586        fHitEnd = TRUE;
2587        returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2588                            //    words are not boundaries.  All non-word chars stand by themselves,
2589                            //    with word boundaries on both sides.
2590    } else {
2591        if (!UTEXT_USES_U16(fInputText)) {
2592            // !!!: Would like a better way to do this!
2593            UErrorCode status = U_ZERO_ERROR;
2594            pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2595        }
2596        returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2597    }
2598#endif
2599    return   returnVal;
2600}
2601
2602//--------------------------------------------------------------------------------
2603//
2604//   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2605//                     saves. Increment the "time" counter, and call the
2606//                     user callback function if there is one installed.
2607//
2608//                     If the match operation needs to be aborted, either for a time-out
2609//                     or because the user callback asked for it, just set an error status.
2610//                     The engine will pick that up and stop in its outer loop.
2611//
2612//--------------------------------------------------------------------------------
2613void RegexMatcher::IncrementTime(UErrorCode &status) {
2614    fTickCounter = TIMER_INITIAL_VALUE;
2615    fTime++;
2616    if (fCallbackFn != NULL) {
2617        if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2618            status = U_REGEX_STOPPED_BY_CALLER;
2619            return;
2620        }
2621    }
2622    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2623        status = U_REGEX_TIME_OUT;
2624    }
2625}
2626
2627//--------------------------------------------------------------------------------
2628//
2629//   ReportFindProgress     This function is called once for each advance in the target
2630//                          string from the find() function, and calls the user progress callback
2631//                          function if there is one installed.
2632//
2633//                          NOTE:
2634//
2635//                          If the match operation needs to be aborted because the user
2636//                          callback asked for it, just set an error status.
2637//                          The engine will pick that up and stop in its outer loop.
2638//
2639//--------------------------------------------------------------------------------
2640UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) {
2641    if (fFindProgressCallbackFn != NULL) {
2642        if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) {
2643            status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/;
2644            return FALSE;
2645        }
2646    }
2647    return TRUE;
2648}
2649
2650//--------------------------------------------------------------------------------
2651//
2652//   StateSave
2653//       Make a new stack frame, initialized as a copy of the current stack frame.
2654//       Set the pattern index in the original stack frame from the operand value
2655//       in the opcode.  Execution of the engine continues with the state in
2656//       the newly created stack frame
2657//
2658//       Note that reserveBlock() may grow the stack, resulting in the
2659//       whole thing being relocated in memory.
2660//
2661//    Parameters:
2662//       fp           The top frame pointer when called.  At return, a new
2663//                    fame will be present
2664//       savePatIdx   An index into the compiled pattern.  Goes into the original
2665//                    (not new) frame.  If execution ever back-tracks out of the
2666//                    new frame, this will be where we continue from in the pattern.
2667//    Return
2668//                    The new frame pointer.
2669//
2670//--------------------------------------------------------------------------------
2671inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2672    // push storage for a new frame.
2673    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2674    if (newFP == NULL) {
2675        // Failure on attempted stack expansion.
2676        //   Stack function set some other error code, change it to a more
2677        //   specific one for regular expressions.
2678        status = U_REGEX_STACK_OVERFLOW;
2679        // We need to return a writable stack frame, so just return the
2680        //    previous frame.  The match operation will stop quickly
2681        //    because of the error status, after which the frame will never
2682        //    be looked at again.
2683        return fp;
2684    }
2685    fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2686
2687    // New stack frame = copy of old top frame.
2688    int64_t *source = (int64_t *)fp;
2689    int64_t *dest   = newFP;
2690    for (;;) {
2691        *dest++ = *source++;
2692        if (source == newFP) {
2693            break;
2694        }
2695    }
2696
2697    fTickCounter--;
2698    if (fTickCounter <= 0) {
2699       IncrementTime(status);    // Re-initializes fTickCounter
2700    }
2701    fp->fPatIdx = savePatIdx;
2702    return (REStackFrame *)newFP;
2703}
2704
2705
2706//--------------------------------------------------------------------------------
2707//
2708//   MatchAt      This is the actual matching engine.
2709//
2710//                  startIdx:    begin matching a this index.
2711//                  toEnd:       if true, match must extend to end of the input region
2712//
2713//--------------------------------------------------------------------------------
2714void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2715    UBool       isMatch  = FALSE;      // True if the we have a match.
2716
2717    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2718
2719    int32_t     op;                    // Operation from the compiled pattern, split into
2720    int32_t     opType;                //    the opcode
2721    int32_t     opValue;               //    and the operand value.
2722
2723#ifdef REGEX_RUN_DEBUG
2724    if (fTraceDebug)
2725    {
2726        printf("MatchAt(startIdx=%ld)\n", startIdx);
2727        printf("Original Pattern: ");
2728        UChar32 c = utext_next32From(fPattern->fPattern, 0);
2729        while (c != U_SENTINEL) {
2730            if (c<32 || c>256) {
2731                c = '.';
2732            }
2733            printf("%c", c);
2734
2735            c = UTEXT_NEXT32(fPattern->fPattern);
2736        }
2737        printf("\n");
2738        printf("Input String: ");
2739        c = utext_next32From(fInputText, 0);
2740        while (c != U_SENTINEL) {
2741            if (c<32 || c>256) {
2742                c = '.';
2743            }
2744            printf("%c", c);
2745
2746            c = UTEXT_NEXT32(fInputText);
2747        }
2748        printf("\n");
2749        printf("\n");
2750    }
2751#endif
2752
2753    if (U_FAILURE(status)) {
2754        return;
2755    }
2756
2757    //  Cache frequently referenced items from the compiled pattern
2758    //
2759    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2760
2761    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2762    UVector             *sets          = fPattern->fSets;
2763
2764    fFrameSize = fPattern->fFrameSize;
2765    REStackFrame        *fp            = resetStack();
2766
2767    fp->fPatIdx   = 0;
2768    fp->fInputIdx = startIdx;
2769
2770    // Zero out the pattern's static data
2771    int32_t i;
2772    for (i = 0; i<fPattern->fDataSize; i++) {
2773        fData[i] = 0;
2774    }
2775
2776    //
2777    //  Main loop for interpreting the compiled pattern.
2778    //  One iteration of the loop per pattern operation performed.
2779    //
2780    for (;;) {
2781        op      = (int32_t)pat[fp->fPatIdx];
2782        opType  = URX_TYPE(op);
2783        opValue = URX_VAL(op);
2784#ifdef REGEX_RUN_DEBUG
2785        if (fTraceDebug) {
2786            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2787            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
2788                UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2789            fPattern->dumpOp(fp->fPatIdx);
2790        }
2791#endif
2792        fp->fPatIdx++;
2793
2794        switch (opType) {
2795
2796
2797        case URX_NOP:
2798            break;
2799
2800
2801        case URX_BACKTRACK:
2802            // Force a backtrack.  In some circumstances, the pattern compiler
2803            //   will notice that the pattern can't possibly match anything, and will
2804            //   emit one of these at that point.
2805            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2806            break;
2807
2808
2809        case URX_ONECHAR:
2810            if (fp->fInputIdx < fActiveLimit) {
2811                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2812                UChar32 c = UTEXT_NEXT32(fInputText);
2813                if (c == opValue) {
2814                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2815                    break;
2816                }
2817            } else {
2818                fHitEnd = TRUE;
2819            }
2820            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2821            break;
2822
2823
2824        case URX_STRING:
2825            {
2826                // Test input against a literal string.
2827                // Strings require two slots in the compiled pattern, one for the
2828                //   offset to the string text, and one for the length.
2829
2830                int32_t   stringStartIdx = opValue;
2831                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2832                fp->fPatIdx++;
2833                opType    = URX_TYPE(op);
2834                int32_t stringLen = URX_VAL(op);
2835                U_ASSERT(opType == URX_STRING_LEN);
2836                U_ASSERT(stringLen >= 2);
2837
2838                const UChar *patternString = litText+stringStartIdx;
2839                int32_t patternStringIndex = 0;
2840                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2841                UChar32 inputChar;
2842                UChar32 patternChar;
2843                UBool success = TRUE;
2844                while (patternStringIndex < stringLen) {
2845                    if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2846                        success = FALSE;
2847                        fHitEnd = TRUE;
2848                        break;
2849                    }
2850                    inputChar = UTEXT_NEXT32(fInputText);
2851                    U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2852                    if (patternChar != inputChar) {
2853                        success = FALSE;
2854                        break;
2855                    }
2856                }
2857
2858                if (success) {
2859                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2860                } else {
2861                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2862                }
2863            }
2864            break;
2865
2866
2867        case URX_STATE_SAVE:
2868            fp = StateSave(fp, opValue, status);
2869            break;
2870
2871
2872        case URX_END:
2873            // The match loop will exit via this path on a successful match,
2874            //   when we reach the end of the pattern.
2875            if (toEnd && fp->fInputIdx != fActiveLimit) {
2876                // The pattern matched, but not to the end of input.  Try some more.
2877                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2878                break;
2879            }
2880            isMatch = TRUE;
2881            goto  breakFromLoop;
2882
2883        // Start and End Capture stack frame variables are laid out out like this:
2884            //  fp->fExtra[opValue]  - The start of a completed capture group
2885            //             opValue+1 - The end   of a completed capture group
2886            //             opValue+2 - the start of a capture group whose end
2887            //                          has not yet been reached (and might not ever be).
2888        case URX_START_CAPTURE:
2889            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2890            fp->fExtra[opValue+2] = fp->fInputIdx;
2891            break;
2892
2893
2894        case URX_END_CAPTURE:
2895            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2896            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2897            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2898            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2899            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2900            break;
2901
2902
2903        case URX_DOLLAR:                   //  $, test for End of line
2904                                           //     or for position before new line at end of input
2905            {
2906                if (fp->fInputIdx >= fAnchorLimit) {
2907                    // We really are at the end of input.  Success.
2908                    fHitEnd = TRUE;
2909                    fRequireEnd = TRUE;
2910                    break;
2911                }
2912
2913                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2914
2915                // If we are positioned just before a new-line that is located at the
2916                //   end of input, succeed.
2917                UChar32 c = UTEXT_NEXT32(fInputText);
2918                if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2919                    if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
2920                        // If not in the middle of a CR/LF sequence
2921                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2922                            // At new-line at end of input. Success
2923                            fHitEnd = TRUE;
2924                            fRequireEnd = TRUE;
2925
2926                            break;
2927                        }
2928                    }
2929                } else {
2930                    UChar32 nextC = UTEXT_NEXT32(fInputText);
2931                    if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2932                        fHitEnd = TRUE;
2933                        fRequireEnd = TRUE;
2934                        break;                         // At CR/LF at end of input.  Success
2935                    }
2936                }
2937
2938                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2939            }
2940            break;
2941
2942
2943         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
2944            if (fp->fInputIdx >= fAnchorLimit) {
2945                // Off the end of input.  Success.
2946                fHitEnd = TRUE;
2947                fRequireEnd = TRUE;
2948                break;
2949            } else {
2950                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2951                UChar32 c = UTEXT_NEXT32(fInputText);
2952                // Either at the last character of input, or off the end.
2953                if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2954                    fHitEnd = TRUE;
2955                    fRequireEnd = TRUE;
2956                    break;
2957                }
2958            }
2959
2960            // Not at end of input.  Back-track out.
2961            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2962            break;
2963
2964
2965         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
2966             {
2967                 if (fp->fInputIdx >= fAnchorLimit) {
2968                     // We really are at the end of input.  Success.
2969                     fHitEnd = TRUE;
2970                     fRequireEnd = TRUE;
2971                     break;
2972                 }
2973                 // If we are positioned just before a new-line, succeed.
2974                 // It makes no difference where the new-line is within the input.
2975                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2976                 UChar32 c = UTEXT_CURRENT32(fInputText);
2977                 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
2978                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
2979                     //  In multi-line mode, hitting a new-line just before the end of input does not
2980                     //   set the hitEnd or requireEnd flags
2981                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
2982                        break;
2983                     }
2984                 }
2985                 // not at a new line.  Fail.
2986                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2987             }
2988             break;
2989
2990
2991         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
2992             {
2993                 if (fp->fInputIdx >= fAnchorLimit) {
2994                     // We really are at the end of input.  Success.
2995                     fHitEnd = TRUE;
2996                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
2997                     break;               //   adding a new-line would not lose the match.
2998                 }
2999                 // If we are not positioned just before a new-line, the test fails; backtrack out.
3000                 // It makes no difference where the new-line is within the input.
3001                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3002                 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3003                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3004                 }
3005             }
3006             break;
3007
3008
3009       case URX_CARET:                    //  ^, test for start of line
3010            if (fp->fInputIdx != fAnchorStart) {
3011                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3012            }
3013            break;
3014
3015
3016       case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3017           {
3018               if (fp->fInputIdx == fAnchorStart) {
3019                   // We are at the start input.  Success.
3020                   break;
3021               }
3022               // Check whether character just before the current pos is a new-line
3023               //   unless we are at the end of input
3024               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3025               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3026               if ((fp->fInputIdx < fAnchorLimit) &&
3027                   ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3028                   //  It's a new-line.  ^ is true.  Success.
3029                   //  TODO:  what should be done with positions between a CR and LF?
3030                   break;
3031               }
3032               // Not at the start of a line.  Fail.
3033               fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3034           }
3035           break;
3036
3037
3038       case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3039           {
3040               U_ASSERT(fp->fInputIdx >= fAnchorStart);
3041               if (fp->fInputIdx <= fAnchorStart) {
3042                   // We are at the start input.  Success.
3043                   break;
3044               }
3045               // Check whether character just before the current pos is a new-line
3046               U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3047               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3048               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3049               if (c != 0x0a) {
3050                   // Not at the start of a line.  Back-track out.
3051                   fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3052               }
3053           }
3054           break;
3055
3056        case URX_BACKSLASH_B:          // Test for word boundaries
3057            {
3058                UBool success = isWordBoundary(fp->fInputIdx);
3059                success ^= (UBool)(opValue != 0);     // flip sense for \B
3060                if (!success) {
3061                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3062                }
3063            }
3064            break;
3065
3066
3067        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3068            {
3069                UBool success = isUWordBoundary(fp->fInputIdx);
3070                success ^= (UBool)(opValue != 0);     // flip sense for \B
3071                if (!success) {
3072                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3073                }
3074            }
3075            break;
3076
3077
3078        case URX_BACKSLASH_D:            // Test for decimal digit
3079            {
3080                if (fp->fInputIdx >= fActiveLimit) {
3081                    fHitEnd = TRUE;
3082                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3083                    break;
3084                }
3085
3086                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3087
3088                UChar32 c = UTEXT_NEXT32(fInputText);
3089                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3090                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3091                success ^= (UBool)(opValue != 0);        // flip sense for \D
3092                if (success) {
3093                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3094                } else {
3095                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3096                }
3097            }
3098            break;
3099
3100
3101        case URX_BACKSLASH_G:          // Test for position at end of previous match
3102            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3103                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3104            }
3105            break;
3106
3107
3108        case URX_BACKSLASH_X:
3109            //  Match a Grapheme, as defined by Unicode TR 29.
3110            //  Differs slightly from Perl, which consumes combining marks independently
3111            //    of context.
3112            {
3113
3114                // Fail if at end of input
3115                if (fp->fInputIdx >= fActiveLimit) {
3116                    fHitEnd = TRUE;
3117                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3118                    break;
3119                }
3120
3121                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3122
3123                // Examine (and consume) the current char.
3124                //   Dispatch into a little state machine, based on the char.
3125                UChar32  c;
3126                c = UTEXT_NEXT32(fInputText);
3127                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3128                UnicodeSet **sets = fPattern->fStaticSets;
3129                if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3130                if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3131                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3132                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3133                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3134                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3135                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3136                goto GC_Extend;
3137
3138
3139
3140GC_L:
3141                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3142                c = UTEXT_NEXT32(fInputText);
3143                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3144                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3145                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3146                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3147                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3148                (void)UTEXT_PREVIOUS32(fInputText);
3149                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3150                goto GC_Extend;
3151
3152GC_V:
3153                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3154                c = UTEXT_NEXT32(fInputText);
3155                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3156                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3157                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3158                (void)UTEXT_PREVIOUS32(fInputText);
3159                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3160                goto GC_Extend;
3161
3162GC_T:
3163                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3164                c = UTEXT_NEXT32(fInputText);
3165                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3166                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3167                (void)UTEXT_PREVIOUS32(fInputText);
3168                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3169                goto GC_Extend;
3170
3171GC_Extend:
3172                // Combining characters are consumed here
3173                for (;;) {
3174                    if (fp->fInputIdx >= fActiveLimit) {
3175                        break;
3176                    }
3177                    c = UTEXT_CURRENT32(fInputText);
3178                    if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3179                        break;
3180                    }
3181                    (void)UTEXT_NEXT32(fInputText);
3182                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3183                }
3184                goto GC_Done;
3185
3186GC_Control:
3187                // Most control chars stand alone (don't combine with combining chars),
3188                //   except for that CR/LF sequence is a single grapheme cluster.
3189                if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3190                    c = UTEXT_NEXT32(fInputText);
3191                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3192                }
3193
3194GC_Done:
3195                if (fp->fInputIdx >= fActiveLimit) {
3196                    fHitEnd = TRUE;
3197                }
3198                break;
3199            }
3200
3201
3202
3203
3204        case URX_BACKSLASH_Z:          // Test for end of Input
3205            if (fp->fInputIdx < fAnchorLimit) {
3206                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3207            } else {
3208                fHitEnd = TRUE;
3209                fRequireEnd = TRUE;
3210            }
3211            break;
3212
3213
3214
3215        case URX_STATIC_SETREF:
3216            {
3217                // Test input character against one of the predefined sets
3218                //    (Word Characters, for example)
3219                // The high bit of the op value is a flag for the match polarity.
3220                //    0:   success if input char is in set.
3221                //    1:   success if input char is not in set.
3222                if (fp->fInputIdx >= fActiveLimit) {
3223                    fHitEnd = TRUE;
3224                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3225                    break;
3226                }
3227
3228                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3229                opValue &= ~URX_NEG_SET;
3230                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3231
3232                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3233                UChar32 c = UTEXT_NEXT32(fInputText);
3234                if (c < 256) {
3235                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3236                    if (s8->contains(c)) {
3237                        success = !success;
3238                    }
3239                } else {
3240                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3241                    if (s->contains(c)) {
3242                        success = !success;
3243                    }
3244                }
3245                if (success) {
3246                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3247                } else {
3248                    // the character wasn't in the set.
3249                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3250                }
3251            }
3252            break;
3253
3254
3255        case URX_STAT_SETREF_N:
3256            {
3257                // Test input character for NOT being a member of  one of
3258                //    the predefined sets (Word Characters, for example)
3259                if (fp->fInputIdx >= fActiveLimit) {
3260                    fHitEnd = TRUE;
3261                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3262                    break;
3263                }
3264
3265                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3266
3267                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3268
3269                UChar32 c = UTEXT_NEXT32(fInputText);
3270                if (c < 256) {
3271                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3272                    if (s8->contains(c) == FALSE) {
3273                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3274                        break;
3275                    }
3276                } else {
3277                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3278                    if (s->contains(c) == FALSE) {
3279                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3280                        break;
3281                    }
3282                }
3283                // the character wasn't in the set.
3284                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3285            }
3286            break;
3287
3288
3289        case URX_SETREF:
3290            if (fp->fInputIdx >= fActiveLimit) {
3291                fHitEnd = TRUE;
3292                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3293                break;
3294            } else {
3295                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3296
3297                // There is input left.  Pick up one char and test it for set membership.
3298                UChar32 c = UTEXT_NEXT32(fInputText);
3299                U_ASSERT(opValue > 0 && opValue < sets->size());
3300                if (c<256) {
3301                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3302                    if (s8->contains(c)) {
3303                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3304                        break;
3305                    }
3306                } else {
3307                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3308                    if (s->contains(c)) {
3309                        // The character is in the set.  A Match.
3310                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3311                        break;
3312                    }
3313                }
3314
3315                // the character wasn't in the set.
3316                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3317            }
3318            break;
3319
3320
3321        case URX_DOTANY:
3322            {
3323                // . matches anything, but stops at end-of-line.
3324                if (fp->fInputIdx >= fActiveLimit) {
3325                    // At end of input.  Match failed.  Backtrack out.
3326                    fHitEnd = TRUE;
3327                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3328                    break;
3329                }
3330
3331                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3332
3333                // There is input left.  Advance over one char, unless we've hit end-of-line
3334                UChar32 c = UTEXT_NEXT32(fInputText);
3335                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
3336                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
3337                    // End of line in normal mode.   . does not match.
3338                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3339                    break;
3340                }
3341                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3342            }
3343            break;
3344
3345
3346        case URX_DOTANY_ALL:
3347            {
3348                // ., in dot-matches-all (including new lines) mode
3349                if (fp->fInputIdx >= fActiveLimit) {
3350                    // At end of input.  Match failed.  Backtrack out.
3351                    fHitEnd = TRUE;
3352                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3353                    break;
3354                }
3355
3356                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3357
3358                // There is input left.  Advance over one char, except if we are
3359                //   at a cr/lf, advance over both of them.
3360                UChar32 c;
3361                c = UTEXT_NEXT32(fInputText);
3362                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3363                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3364                    // In the case of a CR/LF, we need to advance over both.
3365                    UChar32 nextc = UTEXT_CURRENT32(fInputText);
3366                    if (nextc == 0x0a) {
3367                        (void)UTEXT_NEXT32(fInputText);
3368                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3369                    }
3370                }
3371            }
3372            break;
3373
3374
3375        case URX_DOTANY_UNIX:
3376            {
3377                // '.' operator, matches all, but stops at end-of-line.
3378                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3379                if (fp->fInputIdx >= fActiveLimit) {
3380                    // At end of input.  Match failed.  Backtrack out.
3381                    fHitEnd = TRUE;
3382                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3383                    break;
3384                }
3385
3386                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3387
3388                // There is input left.  Advance over one char, unless we've hit end-of-line
3389                UChar32 c = UTEXT_NEXT32(fInputText);
3390                if (c == 0x0a) {
3391                    // End of line in normal mode.   '.' does not match the \n
3392                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3393                } else {
3394                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3395                }
3396            }
3397            break;
3398
3399
3400        case URX_JMP:
3401            fp->fPatIdx = opValue;
3402            break;
3403
3404        case URX_FAIL:
3405            isMatch = FALSE;
3406            goto breakFromLoop;
3407
3408        case URX_JMP_SAV:
3409            U_ASSERT(opValue < fPattern->fCompiledPat->size());
3410            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3411            fp->fPatIdx = opValue;                         // Then JMP.
3412            break;
3413
3414        case URX_JMP_SAV_X:
3415            // This opcode is used with (x)+, when x can match a zero length string.
3416            // Same as JMP_SAV, except conditional on the match having made forward progress.
3417            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3418            //   data address of the input position at the start of the loop.
3419            {
3420                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3421                int32_t  stoOp = (int32_t)pat[opValue-1];
3422                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3423                int32_t  frameLoc = URX_VAL(stoOp);
3424                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3425                int64_t prevInputIdx = fp->fExtra[frameLoc];
3426                U_ASSERT(prevInputIdx <= fp->fInputIdx);
3427                if (prevInputIdx < fp->fInputIdx) {
3428                    // The match did make progress.  Repeat the loop.
3429                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3430                    fp->fPatIdx = opValue;
3431                    fp->fExtra[frameLoc] = fp->fInputIdx;
3432                }
3433                // If the input position did not advance, we do nothing here,
3434                //   execution will fall out of the loop.
3435            }
3436            break;
3437
3438        case URX_CTR_INIT:
3439            {
3440                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3441                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3442
3443                // Pick up the three extra operands that CTR_INIT has, and
3444                //    skip the pattern location counter past
3445                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3446                fp->fPatIdx += 3;
3447                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3448                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3449                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3450                U_ASSERT(minCount>=0);
3451                U_ASSERT(maxCount>=minCount || maxCount==-1);
3452                U_ASSERT(loopLoc>=fp->fPatIdx);
3453
3454                if (minCount == 0) {
3455                    fp = StateSave(fp, loopLoc+1, status);
3456                }
3457                if (maxCount == -1) {
3458                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
3459                } else if (maxCount == 0) {
3460                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3461                }
3462            }
3463            break;
3464
3465        case URX_CTR_LOOP:
3466            {
3467                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3468                int32_t initOp = (int32_t)pat[opValue];
3469                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3470                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3471                int32_t minCount  = (int32_t)pat[opValue+2];
3472                int32_t maxCount  = (int32_t)pat[opValue+3];
3473                (*pCounter)++;
3474                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3475                    U_ASSERT(*pCounter == maxCount);
3476                    break;
3477                }
3478                if (*pCounter >= minCount) {
3479                    if (maxCount == -1) {
3480                        // Loop has no hard upper bound.
3481                        // Check that it is progressing through the input, break if it is not.
3482                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3483                        if (fp->fInputIdx == *pLastInputIdx) {
3484                            break;
3485                        } else {
3486                            *pLastInputIdx = fp->fInputIdx;
3487                        }
3488                    }
3489                    fp = StateSave(fp, fp->fPatIdx, status);
3490                }
3491                fp->fPatIdx = opValue + 4;    // Loop back.
3492            }
3493            break;
3494
3495        case URX_CTR_INIT_NG:
3496            {
3497                // Initialize a non-greedy loop
3498                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3499                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3500
3501                // Pick up the three extra operands that CTR_INIT_NG has, and
3502                //    skip the pattern location counter past
3503                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3504                fp->fPatIdx += 3;
3505                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3506                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3507                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3508                U_ASSERT(minCount>=0);
3509                U_ASSERT(maxCount>=minCount || maxCount==-1);
3510                U_ASSERT(loopLoc>fp->fPatIdx);
3511                if (maxCount == -1) {
3512                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
3513                }
3514
3515                if (minCount == 0) {
3516                    if (maxCount != 0) {
3517                        fp = StateSave(fp, fp->fPatIdx, status);
3518                    }
3519                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3520                }
3521            }
3522            break;
3523
3524        case URX_CTR_LOOP_NG:
3525            {
3526                // Non-greedy {min, max} loops
3527                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3528                int32_t initOp = (int32_t)pat[opValue];
3529                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3530                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3531                int32_t minCount  = (int32_t)pat[opValue+2];
3532                int32_t maxCount  = (int32_t)pat[opValue+3];
3533
3534                (*pCounter)++;
3535                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3536                    // The loop has matched the maximum permitted number of times.
3537                    //   Break out of here with no action.  Matching will
3538                    //   continue with the following pattern.
3539                    U_ASSERT(*pCounter == maxCount);
3540                    break;
3541                }
3542
3543                if (*pCounter < minCount) {
3544                    // We haven't met the minimum number of matches yet.
3545                    //   Loop back for another one.
3546                    fp->fPatIdx = opValue + 4;    // Loop back.
3547                } else {
3548                    // We do have the minimum number of matches.
3549
3550                    // If there is no upper bound on the loop iterations, check that the input index
3551                    // is progressing, and stop the loop if it is not.
3552                    if (maxCount == -1) {
3553                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3554                        if (fp->fInputIdx == *pLastInputIdx) {
3555                            break;
3556                        }
3557                        *pLastInputIdx = fp->fInputIdx;
3558                    }
3559
3560                    // Loop Continuation: we will fall into the pattern following the loop
3561                    //   (non-greedy, don't execute loop body first), but first do
3562                    //   a state save to the top of the loop, so that a match failure
3563                    //   in the following pattern will try another iteration of the loop.
3564                    fp = StateSave(fp, opValue + 4, status);
3565                }
3566            }
3567            break;
3568
3569        case URX_STO_SP:
3570            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3571            fData[opValue] = fStack->size();
3572            break;
3573
3574        case URX_LD_SP:
3575            {
3576                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3577                int32_t newStackSize = (int32_t)fData[opValue];
3578                U_ASSERT(newStackSize <= fStack->size());
3579                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3580                if (newFP == (int64_t *)fp) {
3581                    break;
3582                }
3583                int32_t i;
3584                for (i=0; i<fFrameSize; i++) {
3585                    newFP[i] = ((int64_t *)fp)[i];
3586                }
3587                fp = (REStackFrame *)newFP;
3588                fStack->setSize(newStackSize);
3589            }
3590            break;
3591
3592        case URX_BACKREF:
3593            {
3594                U_ASSERT(opValue < fFrameSize);
3595                int64_t groupStartIdx = fp->fExtra[opValue];
3596                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3597                U_ASSERT(groupStartIdx <= groupEndIdx);
3598                if (groupStartIdx < 0) {
3599                    // This capture group has not participated in the match thus far,
3600                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3601                    break;
3602                }
3603                UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3604                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3605
3606                //   Note: if the capture group match was of an empty string the backref
3607                //         match succeeds.  Verified by testing:  Perl matches succeed
3608                //         in this case, so we do too.
3609
3610                UBool success = TRUE;
3611                for (;;) {
3612                    if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3613                        success = TRUE;
3614                        break;
3615                    }
3616                    if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3617                        success = FALSE;
3618                        fHitEnd = TRUE;
3619                        break;
3620                    }
3621                    UChar32 captureGroupChar = utext_next32(fAltInputText);
3622                    UChar32 inputChar = utext_next32(fInputText);
3623                    if (inputChar != captureGroupChar) {
3624                        success = FALSE;
3625                        break;
3626                    }
3627                }
3628
3629                if (success) {
3630                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3631                } else {
3632                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3633                }
3634            }
3635            break;
3636
3637
3638
3639        case URX_BACKREF_I:
3640            {
3641                U_ASSERT(opValue < fFrameSize);
3642                int64_t groupStartIdx = fp->fExtra[opValue];
3643                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3644                U_ASSERT(groupStartIdx <= groupEndIdx);
3645                if (groupStartIdx < 0) {
3646                    // This capture group has not participated in the match thus far,
3647                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3648                    break;
3649                }
3650                utext_setNativeIndex(fAltInputText, groupStartIdx);
3651                utext_setNativeIndex(fInputText, fp->fInputIdx);
3652                CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3653                CaseFoldingUTextIterator inputItr(*fInputText);
3654
3655                //   Note: if the capture group match was of an empty string the backref
3656                //         match succeeds.  Verified by testing:  Perl matches succeed
3657                //         in this case, so we do too.
3658
3659                UBool success = TRUE;
3660                for (;;) {
3661                    if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3662                        success = TRUE;
3663                        break;
3664                    }
3665                    if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3666                        success = FALSE;
3667                        fHitEnd = TRUE;
3668                        break;
3669                    }
3670                    UChar32 captureGroupChar = captureGroupItr.next();
3671                    UChar32 inputChar = inputItr.next();
3672                    if (inputChar != captureGroupChar) {
3673                        success = FALSE;
3674                        break;
3675                    }
3676                }
3677
3678                if (success && inputItr.inExpansion()) {
3679                    // We otained a match by consuming part of a string obtained from
3680                    // case-folding a single code point of the input text.
3681                    // This does not count as an overall match.
3682                    success = FALSE;
3683                }
3684
3685                if (success) {
3686                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3687                } else {
3688                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3689                }
3690
3691            }
3692            break;
3693
3694        case URX_STO_INP_LOC:
3695            {
3696                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3697                fp->fExtra[opValue] = fp->fInputIdx;
3698            }
3699            break;
3700
3701        case URX_JMPX:
3702            {
3703                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3704                fp->fPatIdx += 1;
3705                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3706                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3707                int64_t savedInputIdx = fp->fExtra[dataLoc];
3708                U_ASSERT(savedInputIdx <= fp->fInputIdx);
3709                if (savedInputIdx < fp->fInputIdx) {
3710                    fp->fPatIdx = opValue;                               // JMP
3711                } else {
3712                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3713                }
3714            }
3715            break;
3716
3717        case URX_LA_START:
3718            {
3719                // Entering a lookahead block.
3720                // Save Stack Ptr, Input Pos.
3721                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3722                fData[opValue]   = fStack->size();
3723                fData[opValue+1] = fp->fInputIdx;
3724                fActiveStart     = fLookStart;          // Set the match region change for
3725                fActiveLimit     = fLookLimit;          //   transparent bounds.
3726            }
3727            break;
3728
3729        case URX_LA_END:
3730            {
3731                // Leaving a look-ahead block.
3732                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3733                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3734                int32_t stackSize = fStack->size();
3735                int32_t newStackSize =(int32_t)fData[opValue];
3736                U_ASSERT(stackSize >= newStackSize);
3737                if (stackSize > newStackSize) {
3738                    // Copy the current top frame back to the new (cut back) top frame.
3739                    //   This makes the capture groups from within the look-ahead
3740                    //   expression available.
3741                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3742                    int32_t i;
3743                    for (i=0; i<fFrameSize; i++) {
3744                        newFP[i] = ((int64_t *)fp)[i];
3745                    }
3746                    fp = (REStackFrame *)newFP;
3747                    fStack->setSize(newStackSize);
3748                }
3749                fp->fInputIdx = fData[opValue+1];
3750
3751                // Restore the active region bounds in the input string; they may have
3752                //    been changed because of transparent bounds on a Region.
3753                fActiveStart = fRegionStart;
3754                fActiveLimit = fRegionLimit;
3755            }
3756            break;
3757
3758        case URX_ONECHAR_I:
3759            // Case insensitive one char.  The char from the pattern is already case folded.
3760            // Input text is not, but case folding the input can not reduce two or more code
3761            // points to one.
3762            if (fp->fInputIdx < fActiveLimit) {
3763                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3764
3765                UChar32 c = UTEXT_NEXT32(fInputText);
3766                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3767                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3768                    break;
3769                }
3770            } else {
3771                fHitEnd = TRUE;
3772            }
3773
3774            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3775            break;
3776
3777        case URX_STRING_I:
3778            {
3779                // Case-insensitive test input against a literal string.
3780                // Strings require two slots in the compiled pattern, one for the
3781                //   offset to the string text, and one for the length.
3782                //   The compiled string has already been case folded.
3783                {
3784                    const UChar *patternString = litText + opValue;
3785                    int32_t      patternStringIdx  = 0;
3786
3787                    op      = (int32_t)pat[fp->fPatIdx];
3788                    fp->fPatIdx++;
3789                    opType  = URX_TYPE(op);
3790                    opValue = URX_VAL(op);
3791                    U_ASSERT(opType == URX_STRING_LEN);
3792                    int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3793
3794
3795                    UChar32   cPattern;
3796                    UChar32   cText;
3797                    UBool     success = TRUE;
3798
3799                    UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3800                    CaseFoldingUTextIterator inputIterator(*fInputText);
3801                    while (patternStringIdx < patternStringLen) {
3802                        if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3803                            success = FALSE;
3804                            fHitEnd = TRUE;
3805                            break;
3806                        }
3807                        U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3808                        cText = inputIterator.next();
3809                        if (cText != cPattern) {
3810                            success = FALSE;
3811                            break;
3812                        }
3813                    }
3814                    if (inputIterator.inExpansion()) {
3815                        success = FALSE;
3816                    }
3817
3818                    if (success) {
3819                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3820                    } else {
3821                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3822                    }
3823                }
3824            }
3825            break;
3826
3827        case URX_LB_START:
3828            {
3829                // Entering a look-behind block.
3830                // Save Stack Ptr, Input Pos.
3831                //   TODO:  implement transparent bounds.  Ticket #6067
3832                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3833                fData[opValue]   = fStack->size();
3834                fData[opValue+1] = fp->fInputIdx;
3835                // Init the variable containing the start index for attempted matches.
3836                fData[opValue+2] = -1;
3837                // Save input string length, then reset to pin any matches to end at
3838                //   the current position.
3839                fData[opValue+3] = fActiveLimit;
3840                fActiveLimit     = fp->fInputIdx;
3841            }
3842            break;
3843
3844
3845        case URX_LB_CONT:
3846            {
3847                // Positive Look-Behind, at top of loop checking for matches of LB expression
3848                //    at all possible input starting positions.
3849
3850                // Fetch the min and max possible match lengths.  They are the operands
3851                //   of this op in the pattern.
3852                int32_t minML = (int32_t)pat[fp->fPatIdx++];
3853                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3854                U_ASSERT(minML <= maxML);
3855                U_ASSERT(minML >= 0);
3856
3857                // Fetch (from data) the last input index where a match was attempted.
3858                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3859                int64_t  *lbStartIdx = &fData[opValue+2];
3860                if (*lbStartIdx < 0) {
3861                    // First time through loop.
3862                    *lbStartIdx = fp->fInputIdx - minML;
3863                } else {
3864                    // 2nd through nth time through the loop.
3865                    // Back up start position for match by one.
3866                    if (*lbStartIdx == 0) {
3867                        (*lbStartIdx)--;
3868                    } else {
3869                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3870                        (void)UTEXT_PREVIOUS32(fInputText);
3871                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3872                    }
3873                }
3874
3875                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3876                    // We have tried all potential match starting points without
3877                    //  getting a match.  Backtrack out, and out of the
3878                    //   Look Behind altogether.
3879                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3880                    int64_t restoreInputLen = fData[opValue+3];
3881                    U_ASSERT(restoreInputLen >= fActiveLimit);
3882                    U_ASSERT(restoreInputLen <= fInputLength);
3883                    fActiveLimit = restoreInputLen;
3884                    break;
3885                }
3886
3887                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3888                //      (successful match will fall off the end of the loop.)
3889                fp = StateSave(fp, fp->fPatIdx-3, status);
3890                fp->fInputIdx = *lbStartIdx;
3891            }
3892            break;
3893
3894        case URX_LB_END:
3895            // End of a look-behind block, after a successful match.
3896            {
3897                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3898                if (fp->fInputIdx != fActiveLimit) {
3899                    //  The look-behind expression matched, but the match did not
3900                    //    extend all the way to the point that we are looking behind from.
3901                    //  FAIL out of here, which will take us back to the LB_CONT, which
3902                    //     will retry the match starting at another position or fail
3903                    //     the look-behind altogether, whichever is appropriate.
3904                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3905                    break;
3906                }
3907
3908                // Look-behind match is good.  Restore the orignal input string length,
3909                //   which had been truncated to pin the end of the lookbehind match to the
3910                //   position being looked-behind.
3911                int64_t originalInputLen = fData[opValue+3];
3912                U_ASSERT(originalInputLen >= fActiveLimit);
3913                U_ASSERT(originalInputLen <= fInputLength);
3914                fActiveLimit = originalInputLen;
3915            }
3916            break;
3917
3918
3919        case URX_LBN_CONT:
3920            {
3921                // Negative Look-Behind, at top of loop checking for matches of LB expression
3922                //    at all possible input starting positions.
3923
3924                // Fetch the extra parameters of this op.
3925                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
3926                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
3927                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
3928                        continueLoc = URX_VAL(continueLoc);
3929                U_ASSERT(minML <= maxML);
3930                U_ASSERT(minML >= 0);
3931                U_ASSERT(continueLoc > fp->fPatIdx);
3932
3933                // Fetch (from data) the last input index where a match was attempted.
3934                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3935                int64_t  *lbStartIdx = &fData[opValue+2];
3936                if (*lbStartIdx < 0) {
3937                    // First time through loop.
3938                    *lbStartIdx = fp->fInputIdx - minML;
3939                } else {
3940                    // 2nd through nth time through the loop.
3941                    // Back up start position for match by one.
3942                    if (*lbStartIdx == 0) {
3943                        (*lbStartIdx)--;
3944                    } else {
3945                        UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3946                        (void)UTEXT_PREVIOUS32(fInputText);
3947                        *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3948                    }
3949                }
3950
3951                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3952                    // We have tried all potential match starting points without
3953                    //  getting a match, which means that the negative lookbehind as
3954                    //  a whole has succeeded.  Jump forward to the continue location
3955                    int64_t restoreInputLen = fData[opValue+3];
3956                    U_ASSERT(restoreInputLen >= fActiveLimit);
3957                    U_ASSERT(restoreInputLen <= fInputLength);
3958                    fActiveLimit = restoreInputLen;
3959                    fp->fPatIdx = continueLoc;
3960                    break;
3961                }
3962
3963                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3964                //      (successful match will cause a FAIL out of the loop altogether.)
3965                fp = StateSave(fp, fp->fPatIdx-4, status);
3966                fp->fInputIdx = *lbStartIdx;
3967            }
3968            break;
3969
3970        case URX_LBN_END:
3971            // End of a negative look-behind block, after a successful match.
3972            {
3973                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3974                if (fp->fInputIdx != fActiveLimit) {
3975                    //  The look-behind expression matched, but the match did not
3976                    //    extend all the way to the point that we are looking behind from.
3977                    //  FAIL out of here, which will take us back to the LB_CONT, which
3978                    //     will retry the match starting at another position or succeed
3979                    //     the look-behind altogether, whichever is appropriate.
3980                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3981                    break;
3982                }
3983
3984                // Look-behind expression matched, which means look-behind test as
3985                //   a whole Fails
3986
3987                //   Restore the orignal input string length, which had been truncated
3988                //   inorder to pin the end of the lookbehind match
3989                //   to the position being looked-behind.
3990                int64_t originalInputLen = fData[opValue+3];
3991                U_ASSERT(originalInputLen >= fActiveLimit);
3992                U_ASSERT(originalInputLen <= fInputLength);
3993                fActiveLimit = originalInputLen;
3994
3995                // Restore original stack position, discarding any state saved
3996                //   by the successful pattern match.
3997                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3998                int32_t newStackSize = (int32_t)fData[opValue];
3999                U_ASSERT(fStack->size() > newStackSize);
4000                fStack->setSize(newStackSize);
4001
4002                //  FAIL, which will take control back to someplace
4003                //  prior to entering the look-behind test.
4004                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4005            }
4006            break;
4007
4008
4009        case URX_LOOP_SR_I:
4010            // Loop Initialization for the optimized implementation of
4011            //     [some character set]*
4012            //   This op scans through all matching input.
4013            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4014            {
4015                U_ASSERT(opValue > 0 && opValue < sets->size());
4016                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4017                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4018
4019                // Loop through input, until either the input is exhausted or
4020                //   we reach a character that is not a member of the set.
4021                int64_t ix = fp->fInputIdx;
4022                UTEXT_SETNATIVEINDEX(fInputText, ix);
4023                for (;;) {
4024                    if (ix >= fActiveLimit) {
4025                        fHitEnd = TRUE;
4026                        break;
4027                    }
4028                    UChar32 c = UTEXT_NEXT32(fInputText);
4029                    if (c<256) {
4030                        if (s8->contains(c) == FALSE) {
4031                            break;
4032                        }
4033                    } else {
4034                        if (s->contains(c) == FALSE) {
4035                            break;
4036                        }
4037                    }
4038                    ix = UTEXT_GETNATIVEINDEX(fInputText);
4039                }
4040
4041                // If there were no matching characters, skip over the loop altogether.
4042                //   The loop doesn't run at all, a * op always succeeds.
4043                if (ix == fp->fInputIdx) {
4044                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4045                    break;
4046                }
4047
4048                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4049                //   must follow.  It's operand is the stack location
4050                //   that holds the starting input index for the match of this [set]*
4051                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4052                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4053                int32_t stackLoc = URX_VAL(loopcOp);
4054                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4055                fp->fExtra[stackLoc] = fp->fInputIdx;
4056                fp->fInputIdx = ix;
4057
4058                // Save State to the URX_LOOP_C op that follows this one,
4059                //   so that match failures in the following code will return to there.
4060                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4061                fp = StateSave(fp, fp->fPatIdx, status);
4062                fp->fPatIdx++;
4063            }
4064            break;
4065
4066
4067        case URX_LOOP_DOT_I:
4068            // Loop Initialization for the optimized implementation of .*
4069            //   This op scans through all remaining input.
4070            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4071            {
4072                // Loop through input until the input is exhausted (we reach an end-of-line)
4073                // In DOTALL mode, we can just go straight to the end of the input.
4074                int64_t ix;
4075                if ((opValue & 1) == 1) {
4076                    // Dot-matches-All mode.  Jump straight to the end of the string.
4077                    ix = fActiveLimit;
4078                    fHitEnd = TRUE;
4079                } else {
4080                    // NOT DOT ALL mode.  Line endings do not match '.'
4081                    // Scan forward until a line ending or end of input.
4082                    ix = fp->fInputIdx;
4083                    UTEXT_SETNATIVEINDEX(fInputText, ix);
4084                    for (;;) {
4085                        if (ix >= fActiveLimit) {
4086                            fHitEnd = TRUE;
4087                            break;
4088                        }
4089                        UChar32 c = UTEXT_NEXT32(fInputText);
4090                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4091                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4092                               (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4093                                    (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) {
4094                                //  char is a line ending.  Exit the scanning loop.
4095                                break;
4096                            }
4097                        }
4098                        ix = UTEXT_GETNATIVEINDEX(fInputText);
4099                    }
4100                }
4101
4102                // If there were no matching characters, skip over the loop altogether.
4103                //   The loop doesn't run at all, a * op always succeeds.
4104                if (ix == fp->fInputIdx) {
4105                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4106                    break;
4107                }
4108
4109                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4110                //   must follow.  It's operand is the stack location
4111                //   that holds the starting input index for the match of this .*
4112                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4113                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4114                int32_t stackLoc = URX_VAL(loopcOp);
4115                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4116                fp->fExtra[stackLoc] = fp->fInputIdx;
4117                fp->fInputIdx = ix;
4118
4119                // Save State to the URX_LOOP_C op that follows this one,
4120                //   so that match failures in the following code will return to there.
4121                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4122                fp = StateSave(fp, fp->fPatIdx, status);
4123                fp->fPatIdx++;
4124            }
4125            break;
4126
4127
4128        case URX_LOOP_C:
4129            {
4130                U_ASSERT(opValue>=0 && opValue<fFrameSize);
4131                backSearchIndex = fp->fExtra[opValue];
4132                U_ASSERT(backSearchIndex <= fp->fInputIdx);
4133                if (backSearchIndex == fp->fInputIdx) {
4134                    // We've backed up the input idx to the point that the loop started.
4135                    // The loop is done.  Leave here without saving state.
4136                    //  Subsequent failures won't come back here.
4137                    break;
4138                }
4139                // Set up for the next iteration of the loop, with input index
4140                //   backed up by one from the last time through,
4141                //   and a state save to this instruction in case the following code fails again.
4142                //   (We're going backwards because this loop emulates stack unwinding, not
4143                //    the initial scan forward.)
4144                U_ASSERT(fp->fInputIdx > 0);
4145                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4146                UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4147                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4148
4149                UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4150                if (prevC == 0x0a &&
4151                    fp->fInputIdx > backSearchIndex &&
4152                    twoPrevC == 0x0d) {
4153                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4154                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4155                        // .*, stepping back over CRLF pair.
4156                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4157                    }
4158                }
4159
4160
4161                fp = StateSave(fp, fp->fPatIdx-1, status);
4162            }
4163            break;
4164
4165
4166
4167        default:
4168            // Trouble.  The compiled pattern contains an entry with an
4169            //           unrecognized type tag.
4170            U_ASSERT(FALSE);
4171        }
4172
4173        if (U_FAILURE(status)) {
4174            isMatch = FALSE;
4175            break;
4176        }
4177    }
4178
4179breakFromLoop:
4180    fMatch = isMatch;
4181    if (isMatch) {
4182        fLastMatchEnd = fMatchEnd;
4183        fMatchStart   = startIdx;
4184        fMatchEnd     = fp->fInputIdx;
4185    }
4186
4187#ifdef REGEX_RUN_DEBUG
4188    if (fTraceDebug) {
4189        if (isMatch) {
4190            printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
4191        } else {
4192            printf("No match\n\n");
4193        }
4194    }
4195#endif
4196
4197    fFrame = fp;                // The active stack frame when the engine stopped.
4198                                //   Contains the capture group results that we need to
4199                                //    access later.
4200    return;
4201}
4202
4203
4204//--------------------------------------------------------------------------------
4205//
4206//   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4207//                  assumption that the entire string is available in the UText's
4208//                  chunk buffer. For now, that means we can use int32_t indexes,
4209//                  except for anything that needs to be saved (like group starts
4210//                  and ends).
4211//
4212//                  startIdx:    begin matching a this index.
4213//                  toEnd:       if true, match must extend to end of the input region
4214//
4215//--------------------------------------------------------------------------------
4216void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4217    UBool       isMatch  = FALSE;      // True if the we have a match.
4218
4219    int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4220
4221    int32_t     op;                    // Operation from the compiled pattern, split into
4222    int32_t     opType;                //    the opcode
4223    int32_t     opValue;               //    and the operand value.
4224
4225#ifdef REGEX_RUN_DEBUG
4226    if (fTraceDebug) {
4227        printf("MatchAt(startIdx=%d)\n", startIdx);
4228        printf("Original Pattern: ");
4229        UChar32 c = utext_next32From(fPattern->fPattern, 0);
4230        while (c != U_SENTINEL) {
4231            if (c<32 || c>256) {
4232                c = '.';
4233            }
4234            printf("%c", c);
4235
4236            c = UTEXT_NEXT32(fPattern->fPattern);
4237        }
4238        printf("\n");
4239        printf("Input String: ");
4240        c = utext_next32From(fInputText, 0);
4241        while (c != U_SENTINEL) {
4242            if (c<32 || c>256) {
4243                c = '.';
4244            }
4245            printf("%c", c);
4246
4247            c = UTEXT_NEXT32(fInputText);
4248        }
4249        printf("\n");
4250        printf("\n");
4251    }
4252#endif
4253
4254    if (U_FAILURE(status)) {
4255        return;
4256    }
4257
4258    //  Cache frequently referenced items from the compiled pattern
4259    //
4260    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4261
4262    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4263    UVector             *sets          = fPattern->fSets;
4264
4265    const UChar         *inputBuf      = fInputText->chunkContents;
4266
4267    fFrameSize = fPattern->fFrameSize;
4268    REStackFrame        *fp            = resetStack();
4269
4270    fp->fPatIdx   = 0;
4271    fp->fInputIdx = startIdx;
4272
4273    // Zero out the pattern's static data
4274    int32_t i;
4275    for (i = 0; i<fPattern->fDataSize; i++) {
4276        fData[i] = 0;
4277    }
4278
4279    //
4280    //  Main loop for interpreting the compiled pattern.
4281    //  One iteration of the loop per pattern operation performed.
4282    //
4283    for (;;) {
4284        op      = (int32_t)pat[fp->fPatIdx];
4285        opType  = URX_TYPE(op);
4286        opValue = URX_VAL(op);
4287#ifdef REGEX_RUN_DEBUG
4288        if (fTraceDebug) {
4289            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4290            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
4291                   UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4292            fPattern->dumpOp(fp->fPatIdx);
4293        }
4294#endif
4295        fp->fPatIdx++;
4296
4297        switch (opType) {
4298
4299
4300        case URX_NOP:
4301            break;
4302
4303
4304        case URX_BACKTRACK:
4305            // Force a backtrack.  In some circumstances, the pattern compiler
4306            //   will notice that the pattern can't possibly match anything, and will
4307            //   emit one of these at that point.
4308            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4309            break;
4310
4311
4312        case URX_ONECHAR:
4313            if (fp->fInputIdx < fActiveLimit) {
4314                UChar32 c;
4315                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4316                if (c == opValue) {
4317                    break;
4318                }
4319            } else {
4320                fHitEnd = TRUE;
4321            }
4322            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4323            break;
4324
4325
4326        case URX_STRING:
4327            {
4328                // Test input against a literal string.
4329                // Strings require two slots in the compiled pattern, one for the
4330                //   offset to the string text, and one for the length.
4331                int32_t   stringStartIdx = opValue;
4332                int32_t   stringLen;
4333
4334                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4335                fp->fPatIdx++;
4336                opType    = URX_TYPE(op);
4337                stringLen = URX_VAL(op);
4338                U_ASSERT(opType == URX_STRING_LEN);
4339                U_ASSERT(stringLen >= 2);
4340
4341                const UChar * pInp = inputBuf + fp->fInputIdx;
4342                const UChar * pInpLimit = inputBuf + fActiveLimit;
4343                const UChar * pPat = litText+stringStartIdx;
4344                const UChar * pEnd = pInp + stringLen;
4345                UBool success = TRUE;
4346                while (pInp < pEnd) {
4347                    if (pInp >= pInpLimit) {
4348                        fHitEnd = TRUE;
4349                        success = FALSE;
4350                        break;
4351                    }
4352                    if (*pInp++ != *pPat++) {
4353                        success = FALSE;
4354                        break;
4355                    }
4356                }
4357
4358                if (success) {
4359                    fp->fInputIdx += stringLen;
4360                } else {
4361                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4362                }
4363            }
4364            break;
4365
4366
4367        case URX_STATE_SAVE:
4368            fp = StateSave(fp, opValue, status);
4369            break;
4370
4371
4372        case URX_END:
4373            // The match loop will exit via this path on a successful match,
4374            //   when we reach the end of the pattern.
4375            if (toEnd && fp->fInputIdx != fActiveLimit) {
4376                // The pattern matched, but not to the end of input.  Try some more.
4377                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4378                break;
4379            }
4380            isMatch = TRUE;
4381            goto  breakFromLoop;
4382
4383            // Start and End Capture stack frame variables are laid out out like this:
4384            //  fp->fExtra[opValue]  - The start of a completed capture group
4385            //             opValue+1 - The end   of a completed capture group
4386            //             opValue+2 - the start of a capture group whose end
4387            //                          has not yet been reached (and might not ever be).
4388        case URX_START_CAPTURE:
4389            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4390            fp->fExtra[opValue+2] = fp->fInputIdx;
4391            break;
4392
4393
4394        case URX_END_CAPTURE:
4395            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4396            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4397            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4398            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4399            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4400            break;
4401
4402
4403        case URX_DOLLAR:                   //  $, test for End of line
4404            //     or for position before new line at end of input
4405            if (fp->fInputIdx < fAnchorLimit-2) {
4406                // We are no where near the end of input.  Fail.
4407                //   This is the common case.  Keep it first.
4408                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4409                break;
4410            }
4411            if (fp->fInputIdx >= fAnchorLimit) {
4412                // We really are at the end of input.  Success.
4413                fHitEnd = TRUE;
4414                fRequireEnd = TRUE;
4415                break;
4416            }
4417
4418            // If we are positioned just before a new-line that is located at the
4419            //   end of input, succeed.
4420            if (fp->fInputIdx == fAnchorLimit-1) {
4421                UChar32 c;
4422                U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4423
4424                if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) {
4425                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4426                        // At new-line at end of input. Success
4427                        fHitEnd = TRUE;
4428                        fRequireEnd = TRUE;
4429                        break;
4430                    }
4431                }
4432            } else if (fp->fInputIdx == fAnchorLimit-2 &&
4433                inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4434                    fHitEnd = TRUE;
4435                    fRequireEnd = TRUE;
4436                    break;                         // At CR/LF at end of input.  Success
4437            }
4438
4439            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4440
4441            break;
4442
4443
4444        case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4445            if (fp->fInputIdx >= fAnchorLimit-1) {
4446                // Either at the last character of input, or off the end.
4447                if (fp->fInputIdx == fAnchorLimit-1) {
4448                    // At last char of input.  Success if it's a new line.
4449                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4450                        fHitEnd = TRUE;
4451                        fRequireEnd = TRUE;
4452                        break;
4453                    }
4454                } else {
4455                    // Off the end of input.  Success.
4456                    fHitEnd = TRUE;
4457                    fRequireEnd = TRUE;
4458                    break;
4459                }
4460            }
4461
4462            // Not at end of input.  Back-track out.
4463            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4464            break;
4465
4466
4467        case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4468            {
4469                if (fp->fInputIdx >= fAnchorLimit) {
4470                    // We really are at the end of input.  Success.
4471                    fHitEnd = TRUE;
4472                    fRequireEnd = TRUE;
4473                    break;
4474                }
4475                // If we are positioned just before a new-line, succeed.
4476                // It makes no difference where the new-line is within the input.
4477                UChar32 c = inputBuf[fp->fInputIdx];
4478                if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) {
4479                    // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4480                    //  In multi-line mode, hitting a new-line just before the end of input does not
4481                    //   set the hitEnd or requireEnd flags
4482                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4483                        break;
4484                    }
4485                }
4486                // not at a new line.  Fail.
4487                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4488            }
4489            break;
4490
4491
4492        case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4493            {
4494                if (fp->fInputIdx >= fAnchorLimit) {
4495                    // We really are at the end of input.  Success.
4496                    fHitEnd = TRUE;
4497                    fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4498                    break;               //   adding a new-line would not lose the match.
4499                }
4500                // If we are not positioned just before a new-line, the test fails; backtrack out.
4501                // It makes no difference where the new-line is within the input.
4502                if (inputBuf[fp->fInputIdx] != 0x0a) {
4503                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4504                }
4505            }
4506            break;
4507
4508
4509        case URX_CARET:                    //  ^, test for start of line
4510            if (fp->fInputIdx != fAnchorStart) {
4511                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4512            }
4513            break;
4514
4515
4516        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4517            {
4518                if (fp->fInputIdx == fAnchorStart) {
4519                    // We are at the start input.  Success.
4520                    break;
4521                }
4522                // Check whether character just before the current pos is a new-line
4523                //   unless we are at the end of input
4524                UChar  c = inputBuf[fp->fInputIdx - 1];
4525                if ((fp->fInputIdx < fAnchorLimit) &&
4526                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4527                    //  It's a new-line.  ^ is true.  Success.
4528                    //  TODO:  what should be done with positions between a CR and LF?
4529                    break;
4530                }
4531                // Not at the start of a line.  Fail.
4532                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4533            }
4534            break;
4535
4536
4537        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4538            {
4539                U_ASSERT(fp->fInputIdx >= fAnchorStart);
4540                if (fp->fInputIdx <= fAnchorStart) {
4541                    // We are at the start input.  Success.
4542                    break;
4543                }
4544                // Check whether character just before the current pos is a new-line
4545                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4546                UChar  c = inputBuf[fp->fInputIdx - 1];
4547                if (c != 0x0a) {
4548                    // Not at the start of a line.  Back-track out.
4549                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4550                }
4551            }
4552            break;
4553
4554        case URX_BACKSLASH_B:          // Test for word boundaries
4555            {
4556                UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4557                success ^= (UBool)(opValue != 0);     // flip sense for \B
4558                if (!success) {
4559                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4560                }
4561            }
4562            break;
4563
4564
4565        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4566            {
4567                UBool success = isUWordBoundary(fp->fInputIdx);
4568                success ^= (UBool)(opValue != 0);     // flip sense for \B
4569                if (!success) {
4570                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4571                }
4572            }
4573            break;
4574
4575
4576        case URX_BACKSLASH_D:            // Test for decimal digit
4577            {
4578                if (fp->fInputIdx >= fActiveLimit) {
4579                    fHitEnd = TRUE;
4580                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4581                    break;
4582                }
4583
4584                UChar32 c;
4585                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4586                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4587                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4588                success ^= (UBool)(opValue != 0);        // flip sense for \D
4589                if (!success) {
4590                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4591                }
4592            }
4593            break;
4594
4595
4596        case URX_BACKSLASH_G:          // Test for position at end of previous match
4597            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4598                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4599            }
4600            break;
4601
4602
4603        case URX_BACKSLASH_X:
4604        //  Match a Grapheme, as defined by Unicode TR 29.
4605        //  Differs slightly from Perl, which consumes combining marks independently
4606        //    of context.
4607        {
4608
4609            // Fail if at end of input
4610            if (fp->fInputIdx >= fActiveLimit) {
4611                fHitEnd = TRUE;
4612                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4613                break;
4614            }
4615
4616            // Examine (and consume) the current char.
4617            //   Dispatch into a little state machine, based on the char.
4618            UChar32  c;
4619            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4620            UnicodeSet **sets = fPattern->fStaticSets;
4621            if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4622            if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4623            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4624            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4625            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4626            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4627            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4628            goto GC_Extend;
4629
4630
4631
4632GC_L:
4633            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4634            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4635            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4636            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4637            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4638            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4639            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4640            goto GC_Extend;
4641
4642GC_V:
4643            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4644            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4645            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4646            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4647            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4648            goto GC_Extend;
4649
4650GC_T:
4651            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4652            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4653            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4654            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4655            goto GC_Extend;
4656
4657GC_Extend:
4658            // Combining characters are consumed here
4659            for (;;) {
4660                if (fp->fInputIdx >= fActiveLimit) {
4661                    break;
4662                }
4663                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4664                if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4665                    U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4666                    break;
4667                }
4668            }
4669            goto GC_Done;
4670
4671GC_Control:
4672            // Most control chars stand alone (don't combine with combining chars),
4673            //   except for that CR/LF sequence is a single grapheme cluster.
4674            if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4675                fp->fInputIdx++;
4676            }
4677
4678GC_Done:
4679            if (fp->fInputIdx >= fActiveLimit) {
4680                fHitEnd = TRUE;
4681            }
4682            break;
4683        }
4684
4685
4686
4687
4688        case URX_BACKSLASH_Z:          // Test for end of Input
4689            if (fp->fInputIdx < fAnchorLimit) {
4690                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4691            } else {
4692                fHitEnd = TRUE;
4693                fRequireEnd = TRUE;
4694            }
4695            break;
4696
4697
4698
4699        case URX_STATIC_SETREF:
4700            {
4701                // Test input character against one of the predefined sets
4702                //    (Word Characters, for example)
4703                // The high bit of the op value is a flag for the match polarity.
4704                //    0:   success if input char is in set.
4705                //    1:   success if input char is not in set.
4706                if (fp->fInputIdx >= fActiveLimit) {
4707                    fHitEnd = TRUE;
4708                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4709                    break;
4710                }
4711
4712                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4713                opValue &= ~URX_NEG_SET;
4714                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4715
4716                UChar32 c;
4717                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4718                if (c < 256) {
4719                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4720                    if (s8->contains(c)) {
4721                        success = !success;
4722                    }
4723                } else {
4724                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
4725                    if (s->contains(c)) {
4726                        success = !success;
4727                    }
4728                }
4729                if (!success) {
4730                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4731                }
4732            }
4733            break;
4734
4735
4736        case URX_STAT_SETREF_N:
4737            {
4738                // Test input character for NOT being a member of  one of
4739                //    the predefined sets (Word Characters, for example)
4740                if (fp->fInputIdx >= fActiveLimit) {
4741                    fHitEnd = TRUE;
4742                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4743                    break;
4744                }
4745
4746                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4747
4748                UChar32  c;
4749                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4750                if (c < 256) {
4751                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4752                    if (s8->contains(c) == FALSE) {
4753                        break;
4754                    }
4755                } else {
4756                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
4757                    if (s->contains(c) == FALSE) {
4758                        break;
4759                    }
4760                }
4761                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4762            }
4763            break;
4764
4765
4766        case URX_SETREF:
4767            {
4768                if (fp->fInputIdx >= fActiveLimit) {
4769                    fHitEnd = TRUE;
4770                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4771                    break;
4772                }
4773
4774                U_ASSERT(opValue > 0 && opValue < sets->size());
4775
4776                // There is input left.  Pick up one char and test it for set membership.
4777                UChar32  c;
4778                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4779                if (c<256) {
4780                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4781                    if (s8->contains(c)) {
4782                        // The character is in the set.  A Match.
4783                        break;
4784                    }
4785                } else {
4786                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4787                    if (s->contains(c)) {
4788                        // The character is in the set.  A Match.
4789                        break;
4790                    }
4791                }
4792
4793                // the character wasn't in the set.
4794                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4795            }
4796            break;
4797
4798
4799        case URX_DOTANY:
4800            {
4801                // . matches anything, but stops at end-of-line.
4802                if (fp->fInputIdx >= fActiveLimit) {
4803                    // At end of input.  Match failed.  Backtrack out.
4804                    fHitEnd = TRUE;
4805                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4806                    break;
4807                }
4808
4809                // There is input left.  Advance over one char, unless we've hit end-of-line
4810                UChar32  c;
4811                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4812                if (((c & 0x7f) <= 0x29) &&     // First quickly bypass as many chars as possible
4813                    ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) {
4814                    // End of line in normal mode.   . does not match.
4815                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4816                    break;
4817                }
4818            }
4819            break;
4820
4821
4822        case URX_DOTANY_ALL:
4823            {
4824                // . in dot-matches-all (including new lines) mode
4825                if (fp->fInputIdx >= fActiveLimit) {
4826                    // At end of input.  Match failed.  Backtrack out.
4827                    fHitEnd = TRUE;
4828                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4829                    break;
4830                }
4831
4832                // There is input left.  Advance over one char, except if we are
4833                //   at a cr/lf, advance over both of them.
4834                UChar32 c;
4835                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4836                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4837                    // In the case of a CR/LF, we need to advance over both.
4838                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4839                        U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4840                    }
4841                }
4842            }
4843            break;
4844
4845
4846        case URX_DOTANY_UNIX:
4847            {
4848                // '.' operator, matches all, but stops at end-of-line.
4849                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
4850                if (fp->fInputIdx >= fActiveLimit) {
4851                    // At end of input.  Match failed.  Backtrack out.
4852                    fHitEnd = TRUE;
4853                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4854                    break;
4855                }
4856
4857                // There is input left.  Advance over one char, unless we've hit end-of-line
4858                UChar32 c;
4859                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4860                if (c == 0x0a) {
4861                    // End of line in normal mode.   '.' does not match the \n
4862                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4863                }
4864            }
4865            break;
4866
4867
4868        case URX_JMP:
4869            fp->fPatIdx = opValue;
4870            break;
4871
4872        case URX_FAIL:
4873            isMatch = FALSE;
4874            goto breakFromLoop;
4875
4876        case URX_JMP_SAV:
4877            U_ASSERT(opValue < fPattern->fCompiledPat->size());
4878            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
4879            fp->fPatIdx = opValue;                         // Then JMP.
4880            break;
4881
4882        case URX_JMP_SAV_X:
4883            // This opcode is used with (x)+, when x can match a zero length string.
4884            // Same as JMP_SAV, except conditional on the match having made forward progress.
4885            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4886            //   data address of the input position at the start of the loop.
4887            {
4888                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4889                int32_t  stoOp = (int32_t)pat[opValue-1];
4890                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4891                int32_t  frameLoc = URX_VAL(stoOp);
4892                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4893                int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4894                U_ASSERT(prevInputIdx <= fp->fInputIdx);
4895                if (prevInputIdx < fp->fInputIdx) {
4896                    // The match did make progress.  Repeat the loop.
4897                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
4898                    fp->fPatIdx = opValue;
4899                    fp->fExtra[frameLoc] = fp->fInputIdx;
4900                }
4901                // If the input position did not advance, we do nothing here,
4902                //   execution will fall out of the loop.
4903            }
4904            break;
4905
4906        case URX_CTR_INIT:
4907            {
4908                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4909                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
4910
4911                // Pick up the three extra operands that CTR_INIT has, and
4912                //    skip the pattern location counter past
4913                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4914                fp->fPatIdx += 3;
4915                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
4916                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4917                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4918                U_ASSERT(minCount>=0);
4919                U_ASSERT(maxCount>=minCount || maxCount==-1);
4920                U_ASSERT(loopLoc>=fp->fPatIdx);
4921
4922                if (minCount == 0) {
4923                    fp = StateSave(fp, loopLoc+1, status);
4924                }
4925                if (maxCount == -1) {
4926                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
4927                } else if (maxCount == 0) {
4928                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4929                }
4930            }
4931            break;
4932
4933        case URX_CTR_LOOP:
4934            {
4935                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4936                int32_t initOp = (int32_t)pat[opValue];
4937                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4938                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4939                int32_t minCount  = (int32_t)pat[opValue+2];
4940                int32_t maxCount  = (int32_t)pat[opValue+3];
4941                (*pCounter)++;
4942                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
4943                    U_ASSERT(*pCounter == maxCount);
4944                    break;
4945                }
4946                if (*pCounter >= minCount) {
4947                    if (maxCount == -1) {
4948                        // Loop has no hard upper bound.
4949                        // Check that it is progressing through the input, break if it is not.
4950                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
4951                        if (fp->fInputIdx == *pLastInputIdx) {
4952                            break;
4953                        } else {
4954                            *pLastInputIdx = fp->fInputIdx;
4955                        }
4956                    }
4957                    fp = StateSave(fp, fp->fPatIdx, status);
4958                }
4959                fp->fPatIdx = opValue + 4;    // Loop back.
4960            }
4961            break;
4962
4963        case URX_CTR_INIT_NG:
4964            {
4965                // Initialize a non-greedy loop
4966                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4967                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
4968
4969                // Pick up the three extra operands that CTR_INIT_NG has, and
4970                //    skip the pattern location counter past
4971                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4972                fp->fPatIdx += 3;
4973                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
4974                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4975                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4976                U_ASSERT(minCount>=0);
4977                U_ASSERT(maxCount>=minCount || maxCount==-1);
4978                U_ASSERT(loopLoc>fp->fPatIdx);
4979                if (maxCount == -1) {
4980                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
4981                }
4982
4983                if (minCount == 0) {
4984                    if (maxCount != 0) {
4985                        fp = StateSave(fp, fp->fPatIdx, status);
4986                    }
4987                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
4988                }
4989            }
4990            break;
4991
4992        case URX_CTR_LOOP_NG:
4993            {
4994                // Non-greedy {min, max} loops
4995                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4996                int32_t initOp = (int32_t)pat[opValue];
4997                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
4998                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4999                int32_t minCount  = (int32_t)pat[opValue+2];
5000                int32_t maxCount  = (int32_t)pat[opValue+3];
5001
5002                (*pCounter)++;
5003                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5004                    // The loop has matched the maximum permitted number of times.
5005                    //   Break out of here with no action.  Matching will
5006                    //   continue with the following pattern.
5007                    U_ASSERT(*pCounter == maxCount);
5008                    break;
5009                }
5010
5011                if (*pCounter < minCount) {
5012                    // We haven't met the minimum number of matches yet.
5013                    //   Loop back for another one.
5014                    fp->fPatIdx = opValue + 4;    // Loop back.
5015                } else {
5016                    // We do have the minimum number of matches.
5017
5018                    // If there is no upper bound on the loop iterations, check that the input index
5019                    // is progressing, and stop the loop if it is not.
5020                    if (maxCount == -1) {
5021                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5022                        if (fp->fInputIdx == *pLastInputIdx) {
5023                            break;
5024                        }
5025                        *pLastInputIdx = fp->fInputIdx;
5026                    }
5027
5028                    // Loop Continuation: we will fall into the pattern following the loop
5029                    //   (non-greedy, don't execute loop body first), but first do
5030                    //   a state save to the top of the loop, so that a match failure
5031                    //   in the following pattern will try another iteration of the loop.
5032                    fp = StateSave(fp, opValue + 4, status);
5033                }
5034            }
5035            break;
5036
5037        case URX_STO_SP:
5038            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5039            fData[opValue] = fStack->size();
5040            break;
5041
5042        case URX_LD_SP:
5043            {
5044                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5045                int32_t newStackSize = (int32_t)fData[opValue];
5046                U_ASSERT(newStackSize <= fStack->size());
5047                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5048                if (newFP == (int64_t *)fp) {
5049                    break;
5050                }
5051                int32_t i;
5052                for (i=0; i<fFrameSize; i++) {
5053                    newFP[i] = ((int64_t *)fp)[i];
5054                }
5055                fp = (REStackFrame *)newFP;
5056                fStack->setSize(newStackSize);
5057            }
5058            break;
5059
5060        case URX_BACKREF:
5061            {
5062                U_ASSERT(opValue < fFrameSize);
5063                int64_t groupStartIdx = fp->fExtra[opValue];
5064                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5065                U_ASSERT(groupStartIdx <= groupEndIdx);
5066                int64_t inputIndex = fp->fInputIdx;
5067                if (groupStartIdx < 0) {
5068                    // This capture group has not participated in the match thus far,
5069                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5070                    break;
5071                }
5072                UBool success = TRUE;
5073                for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5074                    if (inputIndex >= fActiveLimit) {
5075                        success = FALSE;
5076                        fHitEnd = TRUE;
5077                        break;
5078                    }
5079                    if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5080                        success = FALSE;
5081                        break;
5082                    }
5083                }
5084                if (success) {
5085                    fp->fInputIdx = inputIndex;
5086                } else {
5087                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5088                }
5089            }
5090            break;
5091
5092        case URX_BACKREF_I:
5093            {
5094                U_ASSERT(opValue < fFrameSize);
5095                int64_t groupStartIdx = fp->fExtra[opValue];
5096                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5097                U_ASSERT(groupStartIdx <= groupEndIdx);
5098                if (groupStartIdx < 0) {
5099                    // This capture group has not participated in the match thus far,
5100                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5101                    break;
5102                }
5103                CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5104                CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5105
5106                //   Note: if the capture group match was of an empty string the backref
5107                //         match succeeds.  Verified by testing:  Perl matches succeed
5108                //         in this case, so we do too.
5109
5110                UBool success = TRUE;
5111                for (;;) {
5112                    UChar32 captureGroupChar = captureGroupItr.next();
5113                    if (captureGroupChar == U_SENTINEL) {
5114                        success = TRUE;
5115                        break;
5116                    }
5117                    UChar32 inputChar = inputItr.next();
5118                    if (inputChar == U_SENTINEL) {
5119                        success = FALSE;
5120                        fHitEnd = TRUE;
5121                        break;
5122                    }
5123                    if (inputChar != captureGroupChar) {
5124                        success = FALSE;
5125                        break;
5126                    }
5127                }
5128
5129                if (success && inputItr.inExpansion()) {
5130                    // We otained a match by consuming part of a string obtained from
5131                    // case-folding a single code point of the input text.
5132                    // This does not count as an overall match.
5133                    success = FALSE;
5134                }
5135
5136                if (success) {
5137                    fp->fInputIdx = inputItr.getIndex();
5138                } else {
5139                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5140                }
5141            }
5142            break;
5143
5144        case URX_STO_INP_LOC:
5145            {
5146                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5147                fp->fExtra[opValue] = fp->fInputIdx;
5148            }
5149            break;
5150
5151        case URX_JMPX:
5152            {
5153                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5154                fp->fPatIdx += 1;
5155                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5156                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5157                int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5158                U_ASSERT(savedInputIdx <= fp->fInputIdx);
5159                if (savedInputIdx < fp->fInputIdx) {
5160                    fp->fPatIdx = opValue;                               // JMP
5161                } else {
5162                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5163                }
5164            }
5165            break;
5166
5167        case URX_LA_START:
5168            {
5169                // Entering a lookahead block.
5170                // Save Stack Ptr, Input Pos.
5171                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5172                fData[opValue]   = fStack->size();
5173                fData[opValue+1] = fp->fInputIdx;
5174                fActiveStart     = fLookStart;          // Set the match region change for
5175                fActiveLimit     = fLookLimit;          //   transparent bounds.
5176            }
5177            break;
5178
5179        case URX_LA_END:
5180            {
5181                // Leaving a look-ahead block.
5182                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5183                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5184                int32_t stackSize = fStack->size();
5185                int32_t newStackSize = (int32_t)fData[opValue];
5186                U_ASSERT(stackSize >= newStackSize);
5187                if (stackSize > newStackSize) {
5188                    // Copy the current top frame back to the new (cut back) top frame.
5189                    //   This makes the capture groups from within the look-ahead
5190                    //   expression available.
5191                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5192                    int32_t i;
5193                    for (i=0; i<fFrameSize; i++) {
5194                        newFP[i] = ((int64_t *)fp)[i];
5195                    }
5196                    fp = (REStackFrame *)newFP;
5197                    fStack->setSize(newStackSize);
5198                }
5199                fp->fInputIdx = fData[opValue+1];
5200
5201                // Restore the active region bounds in the input string; they may have
5202                //    been changed because of transparent bounds on a Region.
5203                fActiveStart = fRegionStart;
5204                fActiveLimit = fRegionLimit;
5205            }
5206            break;
5207
5208        case URX_ONECHAR_I:
5209            if (fp->fInputIdx < fActiveLimit) {
5210                UChar32 c;
5211                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5212                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5213                    break;
5214                }
5215            } else {
5216                fHitEnd = TRUE;
5217            }
5218            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5219            break;
5220
5221        case URX_STRING_I:
5222            // Case-insensitive test input against a literal string.
5223            // Strings require two slots in the compiled pattern, one for the
5224            //   offset to the string text, and one for the length.
5225            //   The compiled string has already been case folded.
5226            {
5227                const UChar *patternString = litText + opValue;
5228
5229                op      = (int32_t)pat[fp->fPatIdx];
5230                fp->fPatIdx++;
5231                opType  = URX_TYPE(op);
5232                opValue = URX_VAL(op);
5233                U_ASSERT(opType == URX_STRING_LEN);
5234                int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5235
5236                UChar32      cText;
5237                UChar32      cPattern;
5238                UBool        success = TRUE;
5239                int32_t      patternStringIdx  = 0;
5240                CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5241                while (patternStringIdx < patternStringLen) {
5242                    U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5243                    cText = inputIterator.next();
5244                    if (cText != cPattern) {
5245                        success = FALSE;
5246                        if (cText == U_SENTINEL) {
5247                            fHitEnd = TRUE;
5248                        }
5249                        break;
5250                    }
5251                }
5252                if (inputIterator.inExpansion()) {
5253                    success = FALSE;
5254                }
5255
5256                if (success) {
5257                    fp->fInputIdx = inputIterator.getIndex();
5258                } else {
5259                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5260                }
5261            }
5262            break;
5263
5264        case URX_LB_START:
5265            {
5266                // Entering a look-behind block.
5267                // Save Stack Ptr, Input Pos.
5268                //   TODO:  implement transparent bounds.  Ticket #6067
5269                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5270                fData[opValue]   = fStack->size();
5271                fData[opValue+1] = fp->fInputIdx;
5272                // Init the variable containing the start index for attempted matches.
5273                fData[opValue+2] = -1;
5274                // Save input string length, then reset to pin any matches to end at
5275                //   the current position.
5276                fData[opValue+3] = fActiveLimit;
5277                fActiveLimit     = fp->fInputIdx;
5278            }
5279            break;
5280
5281
5282        case URX_LB_CONT:
5283            {
5284                // Positive Look-Behind, at top of loop checking for matches of LB expression
5285                //    at all possible input starting positions.
5286
5287                // Fetch the min and max possible match lengths.  They are the operands
5288                //   of this op in the pattern.
5289                int32_t minML = (int32_t)pat[fp->fPatIdx++];
5290                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5291                U_ASSERT(minML <= maxML);
5292                U_ASSERT(minML >= 0);
5293
5294                // Fetch (from data) the last input index where a match was attempted.
5295                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5296                int64_t  *lbStartIdx = &fData[opValue+2];
5297                if (*lbStartIdx < 0) {
5298                    // First time through loop.
5299                    *lbStartIdx = fp->fInputIdx - minML;
5300                } else {
5301                    // 2nd through nth time through the loop.
5302                    // Back up start position for match by one.
5303                    if (*lbStartIdx == 0) {
5304                        (*lbStartIdx)--;
5305                    } else {
5306                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
5307                    }
5308                }
5309
5310                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5311                    // We have tried all potential match starting points without
5312                    //  getting a match.  Backtrack out, and out of the
5313                    //   Look Behind altogether.
5314                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5315                    int64_t restoreInputLen = fData[opValue+3];
5316                    U_ASSERT(restoreInputLen >= fActiveLimit);
5317                    U_ASSERT(restoreInputLen <= fInputLength);
5318                    fActiveLimit = restoreInputLen;
5319                    break;
5320                }
5321
5322                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5323                //      (successful match will fall off the end of the loop.)
5324                fp = StateSave(fp, fp->fPatIdx-3, status);
5325                fp->fInputIdx =  *lbStartIdx;
5326            }
5327            break;
5328
5329        case URX_LB_END:
5330            // End of a look-behind block, after a successful match.
5331            {
5332                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5333                if (fp->fInputIdx != fActiveLimit) {
5334                    //  The look-behind expression matched, but the match did not
5335                    //    extend all the way to the point that we are looking behind from.
5336                    //  FAIL out of here, which will take us back to the LB_CONT, which
5337                    //     will retry the match starting at another position or fail
5338                    //     the look-behind altogether, whichever is appropriate.
5339                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5340                    break;
5341                }
5342
5343                // Look-behind match is good.  Restore the orignal input string length,
5344                //   which had been truncated to pin the end of the lookbehind match to the
5345                //   position being looked-behind.
5346                int64_t originalInputLen = fData[opValue+3];
5347                U_ASSERT(originalInputLen >= fActiveLimit);
5348                U_ASSERT(originalInputLen <= fInputLength);
5349                fActiveLimit = originalInputLen;
5350            }
5351            break;
5352
5353
5354        case URX_LBN_CONT:
5355            {
5356                // Negative Look-Behind, at top of loop checking for matches of LB expression
5357                //    at all possible input starting positions.
5358
5359                // Fetch the extra parameters of this op.
5360                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5361                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5362                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5363                continueLoc = URX_VAL(continueLoc);
5364                U_ASSERT(minML <= maxML);
5365                U_ASSERT(minML >= 0);
5366                U_ASSERT(continueLoc > fp->fPatIdx);
5367
5368                // Fetch (from data) the last input index where a match was attempted.
5369                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5370                int64_t  *lbStartIdx = &fData[opValue+2];
5371                if (*lbStartIdx < 0) {
5372                    // First time through loop.
5373                    *lbStartIdx = fp->fInputIdx - minML;
5374                } else {
5375                    // 2nd through nth time through the loop.
5376                    // Back up start position for match by one.
5377                    if (*lbStartIdx == 0) {
5378                        (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
5379                    } else {
5380                        U16_BACK_1(inputBuf, 0, *lbStartIdx);
5381                    }
5382                }
5383
5384                if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5385                    // We have tried all potential match starting points without
5386                    //  getting a match, which means that the negative lookbehind as
5387                    //  a whole has succeeded.  Jump forward to the continue location
5388                    int64_t restoreInputLen = fData[opValue+3];
5389                    U_ASSERT(restoreInputLen >= fActiveLimit);
5390                    U_ASSERT(restoreInputLen <= fInputLength);
5391                    fActiveLimit = restoreInputLen;
5392                    fp->fPatIdx = continueLoc;
5393                    break;
5394                }
5395
5396                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5397                //      (successful match will cause a FAIL out of the loop altogether.)
5398                fp = StateSave(fp, fp->fPatIdx-4, status);
5399                fp->fInputIdx =  *lbStartIdx;
5400            }
5401            break;
5402
5403        case URX_LBN_END:
5404            // End of a negative look-behind block, after a successful match.
5405            {
5406                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5407                if (fp->fInputIdx != fActiveLimit) {
5408                    //  The look-behind expression matched, but the match did not
5409                    //    extend all the way to the point that we are looking behind from.
5410                    //  FAIL out of here, which will take us back to the LB_CONT, which
5411                    //     will retry the match starting at another position or succeed
5412                    //     the look-behind altogether, whichever is appropriate.
5413                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5414                    break;
5415                }
5416
5417                // Look-behind expression matched, which means look-behind test as
5418                //   a whole Fails
5419
5420                //   Restore the orignal input string length, which had been truncated
5421                //   inorder to pin the end of the lookbehind match
5422                //   to the position being looked-behind.
5423                int64_t originalInputLen = fData[opValue+3];
5424                U_ASSERT(originalInputLen >= fActiveLimit);
5425                U_ASSERT(originalInputLen <= fInputLength);
5426                fActiveLimit = originalInputLen;
5427
5428                // Restore original stack position, discarding any state saved
5429                //   by the successful pattern match.
5430                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5431                int32_t newStackSize = (int32_t)fData[opValue];
5432                U_ASSERT(fStack->size() > newStackSize);
5433                fStack->setSize(newStackSize);
5434
5435                //  FAIL, which will take control back to someplace
5436                //  prior to entering the look-behind test.
5437                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5438            }
5439            break;
5440
5441
5442        case URX_LOOP_SR_I:
5443            // Loop Initialization for the optimized implementation of
5444            //     [some character set]*
5445            //   This op scans through all matching input.
5446            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5447            {
5448                U_ASSERT(opValue > 0 && opValue < sets->size());
5449                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5450                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5451
5452                // Loop through input, until either the input is exhausted or
5453                //   we reach a character that is not a member of the set.
5454                int32_t ix = (int32_t)fp->fInputIdx;
5455                for (;;) {
5456                    if (ix >= fActiveLimit) {
5457                        fHitEnd = TRUE;
5458                        break;
5459                    }
5460                    UChar32   c;
5461                    U16_NEXT(inputBuf, ix, fActiveLimit, c);
5462                    if (c<256) {
5463                        if (s8->contains(c) == FALSE) {
5464                            U16_BACK_1(inputBuf, 0, ix);
5465                            break;
5466                        }
5467                    } else {
5468                        if (s->contains(c) == FALSE) {
5469                            U16_BACK_1(inputBuf, 0, ix);
5470                            break;
5471                        }
5472                    }
5473                }
5474
5475                // If there were no matching characters, skip over the loop altogether.
5476                //   The loop doesn't run at all, a * op always succeeds.
5477                if (ix == fp->fInputIdx) {
5478                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5479                    break;
5480                }
5481
5482                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5483                //   must follow.  It's operand is the stack location
5484                //   that holds the starting input index for the match of this [set]*
5485                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5486                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5487                int32_t stackLoc = URX_VAL(loopcOp);
5488                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5489                fp->fExtra[stackLoc] = fp->fInputIdx;
5490                fp->fInputIdx = ix;
5491
5492                // Save State to the URX_LOOP_C op that follows this one,
5493                //   so that match failures in the following code will return to there.
5494                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5495                fp = StateSave(fp, fp->fPatIdx, status);
5496                fp->fPatIdx++;
5497            }
5498            break;
5499
5500
5501        case URX_LOOP_DOT_I:
5502            // Loop Initialization for the optimized implementation of .*
5503            //   This op scans through all remaining input.
5504            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5505            {
5506                // Loop through input until the input is exhausted (we reach an end-of-line)
5507                // In DOTALL mode, we can just go straight to the end of the input.
5508                int32_t ix;
5509                if ((opValue & 1) == 1) {
5510                    // Dot-matches-All mode.  Jump straight to the end of the string.
5511                    ix = (int32_t)fActiveLimit;
5512                    fHitEnd = TRUE;
5513                } else {
5514                    // NOT DOT ALL mode.  Line endings do not match '.'
5515                    // Scan forward until a line ending or end of input.
5516                    ix = (int32_t)fp->fInputIdx;
5517                    for (;;) {
5518                        if (ix >= fActiveLimit) {
5519                            fHitEnd = TRUE;
5520                            break;
5521                        }
5522                        UChar32   c;
5523                        U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5524                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5525                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5526                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5527                                   ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) {
5528                                //  char is a line ending.  Put the input pos back to the
5529                                //    line ending char, and exit the scanning loop.
5530                                U16_BACK_1(inputBuf, 0, ix);
5531                                break;
5532                            }
5533                        }
5534                    }
5535                }
5536
5537                // If there were no matching characters, skip over the loop altogether.
5538                //   The loop doesn't run at all, a * op always succeeds.
5539                if (ix == fp->fInputIdx) {
5540                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5541                    break;
5542                }
5543
5544                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5545                //   must follow.  It's operand is the stack location
5546                //   that holds the starting input index for the match of this .*
5547                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5548                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5549                int32_t stackLoc = URX_VAL(loopcOp);
5550                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5551                fp->fExtra[stackLoc] = fp->fInputIdx;
5552                fp->fInputIdx = ix;
5553
5554                // Save State to the URX_LOOP_C op that follows this one,
5555                //   so that match failures in the following code will return to there.
5556                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5557                fp = StateSave(fp, fp->fPatIdx, status);
5558                fp->fPatIdx++;
5559            }
5560            break;
5561
5562
5563        case URX_LOOP_C:
5564            {
5565                U_ASSERT(opValue>=0 && opValue<fFrameSize);
5566                backSearchIndex = (int32_t)fp->fExtra[opValue];
5567                U_ASSERT(backSearchIndex <= fp->fInputIdx);
5568                if (backSearchIndex == fp->fInputIdx) {
5569                    // We've backed up the input idx to the point that the loop started.
5570                    // The loop is done.  Leave here without saving state.
5571                    //  Subsequent failures won't come back here.
5572                    break;
5573                }
5574                // Set up for the next iteration of the loop, with input index
5575                //   backed up by one from the last time through,
5576                //   and a state save to this instruction in case the following code fails again.
5577                //   (We're going backwards because this loop emulates stack unwinding, not
5578                //    the initial scan forward.)
5579                U_ASSERT(fp->fInputIdx > 0);
5580                UChar32 prevC;
5581                U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5582
5583                if (prevC == 0x0a &&
5584                    fp->fInputIdx > backSearchIndex &&
5585                    inputBuf[fp->fInputIdx-1] == 0x0d) {
5586                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5587                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5588                        // .*, stepping back over CRLF pair.
5589                        U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5590                    }
5591                }
5592
5593
5594                fp = StateSave(fp, fp->fPatIdx-1, status);
5595            }
5596            break;
5597
5598
5599
5600        default:
5601            // Trouble.  The compiled pattern contains an entry with an
5602            //           unrecognized type tag.
5603            U_ASSERT(FALSE);
5604        }
5605
5606        if (U_FAILURE(status)) {
5607            isMatch = FALSE;
5608            break;
5609        }
5610    }
5611
5612breakFromLoop:
5613    fMatch = isMatch;
5614    if (isMatch) {
5615        fLastMatchEnd = fMatchEnd;
5616        fMatchStart   = startIdx;
5617        fMatchEnd     = fp->fInputIdx;
5618    }
5619
5620#ifdef REGEX_RUN_DEBUG
5621    if (fTraceDebug) {
5622        if (isMatch) {
5623            printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
5624        } else {
5625            printf("No match\n\n");
5626        }
5627    }
5628#endif
5629
5630    fFrame = fp;                // The active stack frame when the engine stopped.
5631                                //   Contains the capture group results that we need to
5632                                //    access later.
5633
5634    return;
5635}
5636
5637
5638UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5639
5640U_NAMESPACE_END
5641
5642#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5643