1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**************************************************************************
5*   Copyright (C) 2002-2016 International Business Machines Corporation
6*   and others. All rights reserved.
7**************************************************************************
8*/
9//
10//  file:  rematch.cpp
11//
12//         Contains the implementation of class RegexMatcher,
13//         which is one of the main API classes for the ICU regular expression package.
14//
15
16#include "unicode/utypes.h"
17#if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19#include "unicode/regex.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/ustring.h"
23#include "unicode/rbbi.h"
24#include "unicode/utf.h"
25#include "unicode/utf16.h"
26#include "uassert.h"
27#include "cmemory.h"
28#include "cstr.h"
29#include "uvector.h"
30#include "uvectr32.h"
31#include "uvectr64.h"
32#include "regeximp.h"
33#include "regexst.h"
34#include "regextxt.h"
35#include "ucase.h"
36
37// #include <malloc.h>        // Needed for heapcheck testing
38
39
40U_NAMESPACE_BEGIN
41
42// Default limit for the size of the back track stack, to avoid system
43//    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
44// This value puts ICU's limits higher than most other regexp implementations,
45//    which use recursion rather than the heap, and take more storage per
46//    backtrack point.
47//
48static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50// Time limit counter constant.
51//   Time limits for expression evaluation are in terms of quanta of work by
52//   the engine, each of which is 10,000 state saves.
53//   This constant determines that state saves per tick number.
54static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57// Test for any of the Unicode line terminating characters.
58static inline UBool isLineTerminator(UChar32 c) {
59    if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60        return false;
61    }
62    return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63}
64
65//-----------------------------------------------------------------------------
66//
67//   Constructor and Destructor
68//
69//-----------------------------------------------------------------------------
70RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
71    fDeferredStatus = U_ZERO_ERROR;
72    init(fDeferredStatus);
73    if (U_FAILURE(fDeferredStatus)) {
74        return;
75    }
76    if (pat==NULL) {
77        fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78        return;
79    }
80    fPattern = pat;
81    init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82}
83
84
85
86RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87                           uint32_t flags, UErrorCode &status) {
88    init(status);
89    if (U_FAILURE(status)) {
90        return;
91    }
92    UParseError    pe;
93    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
94    fPattern           = fPatternOwned;
95
96    UText inputText = UTEXT_INITIALIZER;
97    utext_openConstUnicodeString(&inputText, &input, &status);
98    init2(&inputText, status);
99    utext_close(&inputText);
100
101    fInputUniStrMaybeMutable = TRUE;
102}
103
104
105RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106                           uint32_t flags, UErrorCode &status) {
107    init(status);
108    if (U_FAILURE(status)) {
109        return;
110    }
111    UParseError    pe;
112    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
113    if (U_FAILURE(status)) {
114        return;
115    }
116
117    fPattern           = fPatternOwned;
118    init2(input, status);
119}
120
121
122RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123                           uint32_t flags, UErrorCode &status) {
124    init(status);
125    if (U_FAILURE(status)) {
126        return;
127    }
128    UParseError    pe;
129    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
130    if (U_FAILURE(status)) {
131        return;
132    }
133    fPattern           = fPatternOwned;
134    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135}
136
137RegexMatcher::RegexMatcher(UText *regexp,
138                           uint32_t flags, UErrorCode &status) {
139    init(status);
140    if (U_FAILURE(status)) {
141        return;
142    }
143    UParseError    pe;
144    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
145        if (U_FAILURE(status)) {
146        return;
147    }
148
149    fPattern           = fPatternOwned;
150    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151}
152
153
154
155
156RegexMatcher::~RegexMatcher() {
157    delete fStack;
158    if (fData != fSmallData) {
159        uprv_free(fData);
160        fData = NULL;
161    }
162    if (fPatternOwned) {
163        delete fPatternOwned;
164        fPatternOwned = NULL;
165        fPattern = NULL;
166    }
167
168    if (fInput) {
169        delete fInput;
170    }
171    if (fInputText) {
172        utext_close(fInputText);
173    }
174    if (fAltInputText) {
175        utext_close(fAltInputText);
176    }
177
178    #if UCONFIG_NO_BREAK_ITERATION==0
179    delete fWordBreakItr;
180    #endif
181}
182
183//
184//   init()   common initialization for use by all constructors.
185//            Initialize all fields, get the object into a consistent state.
186//            This must be done even when the initial status shows an error,
187//            so that the object is initialized sufficiently well for the destructor
188//            to run safely.
189//
190void RegexMatcher::init(UErrorCode &status) {
191    fPattern           = NULL;
192    fPatternOwned      = NULL;
193    fFrameSize         = 0;
194    fRegionStart       = 0;
195    fRegionLimit       = 0;
196    fAnchorStart       = 0;
197    fAnchorLimit       = 0;
198    fLookStart         = 0;
199    fLookLimit         = 0;
200    fActiveStart       = 0;
201    fActiveLimit       = 0;
202    fTransparentBounds = FALSE;
203    fAnchoringBounds   = TRUE;
204    fMatch             = FALSE;
205    fMatchStart        = 0;
206    fMatchEnd          = 0;
207    fLastMatchEnd      = -1;
208    fAppendPosition    = 0;
209    fHitEnd            = FALSE;
210    fRequireEnd        = FALSE;
211    fStack             = NULL;
212    fFrame             = NULL;
213    fTimeLimit         = 0;
214    fTime              = 0;
215    fTickCounter       = 0;
216    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
217    fCallbackFn        = NULL;
218    fCallbackContext   = NULL;
219    fFindProgressCallbackFn      = NULL;
220    fFindProgressCallbackContext = NULL;
221    fTraceDebug        = FALSE;
222    fDeferredStatus    = status;
223    fData              = fSmallData;
224    fWordBreakItr      = NULL;
225
226    fStack             = NULL;
227    fInputText         = NULL;
228    fAltInputText      = NULL;
229    fInput             = NULL;
230    fInputLength       = 0;
231    fInputUniStrMaybeMutable = FALSE;
232}
233
234//
235//  init2()   Common initialization for use by RegexMatcher constructors, part 2.
236//            This handles the common setup to be done after the Pattern is available.
237//
238void RegexMatcher::init2(UText *input, UErrorCode &status) {
239    if (U_FAILURE(status)) {
240        fDeferredStatus = status;
241        return;
242    }
243
244    if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245        fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
246        if (fData == NULL) {
247            status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248            return;
249        }
250    }
251
252    fStack = new UVector64(status);
253    if (fStack == NULL) {
254        status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255        return;
256    }
257
258    reset(input);
259    setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260    if (U_FAILURE(status)) {
261        fDeferredStatus = status;
262        return;
263    }
264}
265
266
267static const UChar BACKSLASH  = 0x5c;
268static const UChar DOLLARSIGN = 0x24;
269static const UChar LEFTBRACKET = 0x7b;
270static const UChar RIGHTBRACKET = 0x7d;
271
272//--------------------------------------------------------------------------------
273//
274//    appendReplacement
275//
276//--------------------------------------------------------------------------------
277RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278                                              const UnicodeString &replacement,
279                                              UErrorCode &status) {
280    UText replacementText = UTEXT_INITIALIZER;
281
282    utext_openConstUnicodeString(&replacementText, &replacement, &status);
283    if (U_SUCCESS(status)) {
284        UText resultText = UTEXT_INITIALIZER;
285        utext_openUnicodeString(&resultText, &dest, &status);
286
287        if (U_SUCCESS(status)) {
288            appendReplacement(&resultText, &replacementText, status);
289            utext_close(&resultText);
290        }
291        utext_close(&replacementText);
292    }
293
294    return *this;
295}
296
297//
298//    appendReplacement, UText mode
299//
300RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301                                              UText *replacement,
302                                              UErrorCode &status) {
303    if (U_FAILURE(status)) {
304        return *this;
305    }
306    if (U_FAILURE(fDeferredStatus)) {
307        status = fDeferredStatus;
308        return *this;
309    }
310    if (fMatch == FALSE) {
311        status = U_REGEX_INVALID_STATE;
312        return *this;
313    }
314
315    // Copy input string from the end of previous match to start of current match
316    int64_t  destLen = utext_nativeLength(dest);
317    if (fMatchStart > fAppendPosition) {
318        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320                                     (int32_t)(fMatchStart-fAppendPosition), &status);
321        } else {
322            int32_t len16;
323            if (UTEXT_USES_U16(fInputText)) {
324                len16 = (int32_t)(fMatchStart-fAppendPosition);
325            } else {
326                UErrorCode lengthStatus = U_ZERO_ERROR;
327                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
328            }
329            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330            if (inputChars == NULL) {
331                status = U_MEMORY_ALLOCATION_ERROR;
332                return *this;
333            }
334            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336            uprv_free(inputChars);
337        }
338    }
339    fAppendPosition = fMatchEnd;
340
341
342    // scan the replacement text, looking for substitutions ($n) and \escapes.
343    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
344    //         move entire ranges not containing substitutions.
345    UTEXT_SETNATIVEINDEX(replacement, 0);
346    for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
347        if (c == BACKSLASH) {
348            // Backslash Escape.  Copy the following char out without further checks.
349            //                    Note:  Surrogate pairs don't need any special handling
350            //                           The second half wont be a '$' or a '\', and
351            //                           will move to the dest normally on the next
352            //                           loop iteration.
353            c = UTEXT_CURRENT32(replacement);
354            if (c == U_SENTINEL) {
355                break;
356            }
357
358            if (c==0x55/*U*/ || c==0x75/*u*/) {
359                // We have a \udddd or \Udddddddd escape sequence.
360                int32_t offset = 0;
361                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363                if (escapedChar != (UChar32)0xFFFFFFFF) {
364                    if (U_IS_BMP(escapedChar)) {
365                        UChar c16 = (UChar)escapedChar;
366                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367                    } else {
368                        UChar surrogate[2];
369                        surrogate[0] = U16_LEAD(escapedChar);
370                        surrogate[1] = U16_TRAIL(escapedChar);
371                        if (U_SUCCESS(status)) {
372                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373                        }
374                    }
375                    // TODO:  Report errors for mal-formed \u escapes?
376                    //        As this is, the original sequence is output, which may be OK.
377                    if (context.lastOffset == offset) {
378                        (void)UTEXT_PREVIOUS32(replacement);
379                    } else if (context.lastOffset != offset-1) {
380                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381                    }
382                }
383            } else {
384                (void)UTEXT_NEXT32(replacement);
385                // Plain backslash escape.  Just put out the escaped character.
386                if (U_IS_BMP(c)) {
387                    UChar c16 = (UChar)c;
388                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389                } else {
390                    UChar surrogate[2];
391                    surrogate[0] = U16_LEAD(c);
392                    surrogate[1] = U16_TRAIL(c);
393                    if (U_SUCCESS(status)) {
394                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395                    }
396                }
397            }
398        } else if (c != DOLLARSIGN) {
399            // Normal char, not a $.  Copy it out without further checks.
400            if (U_IS_BMP(c)) {
401                UChar c16 = (UChar)c;
402                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403            } else {
404                UChar surrogate[2];
405                surrogate[0] = U16_LEAD(c);
406                surrogate[1] = U16_TRAIL(c);
407                if (U_SUCCESS(status)) {
408                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409                }
410            }
411        } else {
412            // We've got a $.  Pick up a capture group name or number if one follows.
413            // Consume digits so long as the resulting group number <= the number of
414            // number of capture groups in the pattern.
415
416            int32_t groupNum  = 0;
417            int32_t numDigits = 0;
418            UChar32 nextChar = utext_current32(replacement);
419            if (nextChar == LEFTBRACKET) {
420                // Scan for a Named Capture Group, ${name}.
421                UnicodeString groupName;
422                utext_next32(replacement);
423                while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424                    nextChar = utext_next32(replacement);
425                    if (nextChar == U_SENTINEL) {
426                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427                    } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
428                               (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
429                               (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
430                        groupName.append(nextChar);
431                    } else if (nextChar == RIGHTBRACKET) {
432                        groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
433                        if (groupNum == 0) {
434                            status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435                        }
436                    } else {
437                        // Character was something other than a name char or a closing '}'
438                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439                    }
440                }
441
442            } else if (u_isdigit(nextChar)) {
443                // $n    Scan for a capture group number
444                int32_t numCaptureGroups = fPattern->fGroupMap->size();
445                for (;;) {
446                    nextChar = UTEXT_CURRENT32(replacement);
447                    if (nextChar == U_SENTINEL) {
448                        break;
449                    }
450                    if (u_isdigit(nextChar) == FALSE) {
451                        break;
452                    }
453                    int32_t nextDigitVal = u_charDigitValue(nextChar);
454                    if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455                        // Don't consume the next digit if it makes the capture group number too big.
456                        if (numDigits == 0) {
457                            status = U_INDEX_OUTOFBOUNDS_ERROR;
458                        }
459                        break;
460                    }
461                    (void)UTEXT_NEXT32(replacement);
462                    groupNum=groupNum*10 + nextDigitVal;
463                    ++numDigits;
464                }
465            } else {
466                // $ not followed by capture group name or number.
467                status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468            }
469
470            if (U_SUCCESS(status)) {
471                destLen += appendGroup(groupNum, dest, status);
472            }
473        }  // End of $ capture group handling
474    }  // End of per-character loop through the replacement string.
475
476    return *this;
477}
478
479
480
481//--------------------------------------------------------------------------------
482//
483//    appendTail     Intended to be used in conjunction with appendReplacement()
484//                   To the destination string, append everything following
485//                   the last match position from the input string.
486//
487//                   Note:  Match ranges do not affect appendTail or appendReplacement
488//
489//--------------------------------------------------------------------------------
490UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491    UErrorCode status = U_ZERO_ERROR;
492    UText resultText = UTEXT_INITIALIZER;
493    utext_openUnicodeString(&resultText, &dest, &status);
494
495    if (U_SUCCESS(status)) {
496        appendTail(&resultText, status);
497        utext_close(&resultText);
498    }
499
500    return dest;
501}
502
503//
504//   appendTail, UText mode
505//
506UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507    if (U_FAILURE(status)) {
508        return dest;
509    }
510    if (U_FAILURE(fDeferredStatus)) {
511        status = fDeferredStatus;
512        return dest;
513    }
514
515    if (fInputLength > fAppendPosition) {
516        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517            int64_t destLen = utext_nativeLength(dest);
518            utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519                          (int32_t)(fInputLength-fAppendPosition), &status);
520        } else {
521            int32_t len16;
522            if (UTEXT_USES_U16(fInputText)) {
523                len16 = (int32_t)(fInputLength-fAppendPosition);
524            } else {
525                len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526                status = U_ZERO_ERROR; // buffer overflow
527            }
528
529            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530            if (inputChars == NULL) {
531                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532            } else {
533                utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534                int64_t destLen = utext_nativeLength(dest);
535                utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536                uprv_free(inputChars);
537            }
538        }
539    }
540    return dest;
541}
542
543
544
545//--------------------------------------------------------------------------------
546//
547//   end
548//
549//--------------------------------------------------------------------------------
550int32_t RegexMatcher::end(UErrorCode &err) const {
551    return end(0, err);
552}
553
554int64_t RegexMatcher::end64(UErrorCode &err) const {
555    return end64(0, err);
556}
557
558int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559    if (U_FAILURE(err)) {
560        return -1;
561    }
562    if (fMatch == FALSE) {
563        err = U_REGEX_INVALID_STATE;
564        return -1;
565    }
566    if (group < 0 || group > fPattern->fGroupMap->size()) {
567        err = U_INDEX_OUTOFBOUNDS_ERROR;
568        return -1;
569    }
570    int64_t e = -1;
571    if (group == 0) {
572        e = fMatchEnd;
573    } else {
574        // Get the position within the stack frame of the variables for
575        //    this capture group.
576        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577        U_ASSERT(groupOffset < fPattern->fFrameSize);
578        U_ASSERT(groupOffset >= 0);
579        e = fFrame->fExtra[groupOffset + 1];
580    }
581
582        return e;
583}
584
585int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586    return (int32_t)end64(group, err);
587}
588
589//--------------------------------------------------------------------------------
590//
591//   findProgressInterrupt  This function is called once for each advance in the target
592//                          string from the find() function, and calls the user progress callback
593//                          function if there is one installed.
594//
595//         Return:  TRUE if the find operation is to be terminated.
596//                  FALSE if the find operation is to continue running.
597//
598//--------------------------------------------------------------------------------
599UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600    if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601        status = U_REGEX_STOPPED_BY_CALLER;
602        return TRUE;
603    }
604    return FALSE;
605}
606
607//--------------------------------------------------------------------------------
608//
609//   find()
610//
611//--------------------------------------------------------------------------------
612UBool RegexMatcher::find() {
613    if (U_FAILURE(fDeferredStatus)) {
614        return FALSE;
615    }
616    UErrorCode status = U_ZERO_ERROR;
617    UBool result = find(status);
618    return result;
619}
620
621//--------------------------------------------------------------------------------
622//
623//   find()
624//
625//--------------------------------------------------------------------------------
626UBool RegexMatcher::find(UErrorCode &status) {
627    // Start at the position of the last match end.  (Will be zero if the
628    //   matcher has been reset.)
629    //
630    if (U_FAILURE(status)) {
631        return FALSE;
632    }
633    if (U_FAILURE(fDeferredStatus)) {
634        status = fDeferredStatus;
635        return FALSE;
636    }
637
638    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639        return findUsingChunk(status);
640    }
641
642    int64_t startPos = fMatchEnd;
643    if (startPos==0) {
644        startPos = fActiveStart;
645    }
646
647    if (fMatch) {
648        // Save the position of any previous successful match.
649        fLastMatchEnd = fMatchEnd;
650
651        if (fMatchStart == fMatchEnd) {
652            // Previous match had zero length.  Move start position up one position
653            //  to avoid sending find() into a loop on zero-length matches.
654            if (startPos >= fActiveLimit) {
655                fMatch = FALSE;
656                fHitEnd = TRUE;
657                return FALSE;
658            }
659            UTEXT_SETNATIVEINDEX(fInputText, startPos);
660            (void)UTEXT_NEXT32(fInputText);
661            startPos = UTEXT_GETNATIVEINDEX(fInputText);
662        }
663    } else {
664        if (fLastMatchEnd >= 0) {
665            // A previous find() failed to match.  Don't try again.
666            //   (without this test, a pattern with a zero-length match
667            //    could match again at the end of an input string.)
668            fHitEnd = TRUE;
669            return FALSE;
670        }
671    }
672
673
674    // Compute the position in the input string beyond which a match can not begin, because
675    //   the minimum length match would extend past the end of the input.
676    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
677    //          Be aware of possible overflows if making changes here.
678    int64_t testStartLimit;
679    if (UTEXT_USES_U16(fInputText)) {
680        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681        if (startPos > testStartLimit) {
682            fMatch = FALSE;
683            fHitEnd = TRUE;
684            return FALSE;
685        }
686    } else {
687        // We don't know exactly how long the minimum match length is in native characters.
688        // Treat anything > 0 as 1.
689        testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690    }
691
692    UChar32  c;
693    U_ASSERT(startPos >= 0);
694
695    switch (fPattern->fStartType) {
696    case START_NO_INFO:
697        // No optimization was found.
698        //  Try a match at each input position.
699        for (;;) {
700            MatchAt(startPos, FALSE, status);
701            if (U_FAILURE(status)) {
702                return FALSE;
703            }
704            if (fMatch) {
705                return TRUE;
706            }
707            if (startPos >= testStartLimit) {
708                fHitEnd = TRUE;
709                return FALSE;
710            }
711            UTEXT_SETNATIVEINDEX(fInputText, startPos);
712            (void)UTEXT_NEXT32(fInputText);
713            startPos = UTEXT_GETNATIVEINDEX(fInputText);
714            // Note that it's perfectly OK for a pattern to have a zero-length
715            //   match at the end of a string, so we must make sure that the loop
716            //   runs with startPos == testStartLimit the last time through.
717            if  (findProgressInterrupt(startPos, status))
718                return FALSE;
719        }
720        U_ASSERT(FALSE);
721
722    case START_START:
723        // Matches are only possible at the start of the input string
724        //   (pattern begins with ^ or \A)
725        if (startPos > fActiveStart) {
726            fMatch = FALSE;
727            return FALSE;
728        }
729        MatchAt(startPos, FALSE, status);
730        if (U_FAILURE(status)) {
731            return FALSE;
732        }
733        return fMatch;
734
735
736    case START_SET:
737        {
738            // Match may start on any char from a pre-computed set.
739            U_ASSERT(fPattern->fMinMatchLen > 0);
740            UTEXT_SETNATIVEINDEX(fInputText, startPos);
741            for (;;) {
742                int64_t pos = startPos;
743                c = UTEXT_NEXT32(fInputText);
744                startPos = UTEXT_GETNATIVEINDEX(fInputText);
745                // c will be -1 (U_SENTINEL) at end of text, in which case we
746                // skip this next block (so we don't have a negative array index)
747                // and handle end of text in the following block.
748                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
750                    MatchAt(pos, FALSE, status);
751                    if (U_FAILURE(status)) {
752                        return FALSE;
753                    }
754                    if (fMatch) {
755                        return TRUE;
756                    }
757                    UTEXT_SETNATIVEINDEX(fInputText, pos);
758                }
759                if (startPos > testStartLimit) {
760                    fMatch = FALSE;
761                    fHitEnd = TRUE;
762                    return FALSE;
763                }
764                if  (findProgressInterrupt(startPos, status))
765                    return FALSE;
766            }
767        }
768        U_ASSERT(FALSE);
769
770    case START_STRING:
771    case START_CHAR:
772        {
773            // Match starts on exactly one char.
774            U_ASSERT(fPattern->fMinMatchLen > 0);
775            UChar32 theChar = fPattern->fInitialChar;
776            UTEXT_SETNATIVEINDEX(fInputText, startPos);
777            for (;;) {
778                int64_t pos = startPos;
779                c = UTEXT_NEXT32(fInputText);
780                startPos = UTEXT_GETNATIVEINDEX(fInputText);
781                if (c == theChar) {
782                    MatchAt(pos, FALSE, status);
783                    if (U_FAILURE(status)) {
784                        return FALSE;
785                    }
786                    if (fMatch) {
787                        return TRUE;
788                    }
789                    UTEXT_SETNATIVEINDEX(fInputText, startPos);
790                }
791                if (startPos > testStartLimit) {
792                    fMatch = FALSE;
793                    fHitEnd = TRUE;
794                    return FALSE;
795                }
796                if  (findProgressInterrupt(startPos, status))
797                    return FALSE;
798           }
799        }
800        U_ASSERT(FALSE);
801
802    case START_LINE:
803        {
804            UChar32  c;
805            if (startPos == fAnchorStart) {
806                MatchAt(startPos, FALSE, status);
807                if (U_FAILURE(status)) {
808                    return FALSE;
809                }
810                if (fMatch) {
811                    return TRUE;
812                }
813                UTEXT_SETNATIVEINDEX(fInputText, startPos);
814                c = UTEXT_NEXT32(fInputText);
815                startPos = UTEXT_GETNATIVEINDEX(fInputText);
816            } else {
817                UTEXT_SETNATIVEINDEX(fInputText, startPos);
818                c = UTEXT_PREVIOUS32(fInputText);
819                UTEXT_SETNATIVEINDEX(fInputText, startPos);
820            }
821
822            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823                for (;;) {
824                    if (c == 0x0a) {
825                            MatchAt(startPos, FALSE, status);
826                            if (U_FAILURE(status)) {
827                                return FALSE;
828                            }
829                            if (fMatch) {
830                                return TRUE;
831                            }
832                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
833                    }
834                    if (startPos >= testStartLimit) {
835                        fMatch = FALSE;
836                        fHitEnd = TRUE;
837                        return FALSE;
838                    }
839                    c = UTEXT_NEXT32(fInputText);
840                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
841                    // Note that it's perfectly OK for a pattern to have a zero-length
842                    //   match at the end of a string, so we must make sure that the loop
843                    //   runs with startPos == testStartLimit the last time through.
844                    if  (findProgressInterrupt(startPos, status))
845                        return FALSE;
846                }
847            } else {
848                for (;;) {
849                    if (isLineTerminator(c)) {
850                        if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851                            (void)UTEXT_NEXT32(fInputText);
852                            startPos = UTEXT_GETNATIVEINDEX(fInputText);
853                        }
854                        MatchAt(startPos, FALSE, status);
855                        if (U_FAILURE(status)) {
856                            return FALSE;
857                        }
858                        if (fMatch) {
859                            return TRUE;
860                        }
861                        UTEXT_SETNATIVEINDEX(fInputText, startPos);
862                    }
863                    if (startPos >= testStartLimit) {
864                        fMatch = FALSE;
865                        fHitEnd = TRUE;
866                        return FALSE;
867                    }
868                    c = UTEXT_NEXT32(fInputText);
869                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
870                    // Note that it's perfectly OK for a pattern to have a zero-length
871                    //   match at the end of a string, so we must make sure that the loop
872                    //   runs with startPos == testStartLimit the last time through.
873                    if  (findProgressInterrupt(startPos, status))
874                        return FALSE;
875                }
876            }
877        }
878
879    default:
880        U_ASSERT(FALSE);
881    }
882
883    U_ASSERT(FALSE);
884    return FALSE;
885}
886
887
888
889UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
890    if (U_FAILURE(status)) {
891        return FALSE;
892    }
893    if (U_FAILURE(fDeferredStatus)) {
894        status = fDeferredStatus;
895        return FALSE;
896    }
897    this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
898                                          //        This will reset the region to be the full input length.
899    if (start < 0) {
900        status = U_INDEX_OUTOFBOUNDS_ERROR;
901        return FALSE;
902    }
903
904    int64_t nativeStart = start;
905    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
906        status = U_INDEX_OUTOFBOUNDS_ERROR;
907        return FALSE;
908    }
909    fMatchEnd = nativeStart;
910    return find(status);
911}
912
913
914//--------------------------------------------------------------------------------
915//
916//   findUsingChunk() -- like find(), but with the advance knowledge that the
917//                       entire string is available in the UText's chunk buffer.
918//
919//--------------------------------------------------------------------------------
920UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
921    // Start at the position of the last match end.  (Will be zero if the
922    //   matcher has been reset.
923    //
924
925    int32_t startPos = (int32_t)fMatchEnd;
926    if (startPos==0) {
927        startPos = (int32_t)fActiveStart;
928    }
929
930    const UChar *inputBuf = fInputText->chunkContents;
931
932    if (fMatch) {
933        // Save the position of any previous successful match.
934        fLastMatchEnd = fMatchEnd;
935
936        if (fMatchStart == fMatchEnd) {
937            // Previous match had zero length.  Move start position up one position
938            //  to avoid sending find() into a loop on zero-length matches.
939            if (startPos >= fActiveLimit) {
940                fMatch = FALSE;
941                fHitEnd = TRUE;
942                return FALSE;
943            }
944            U16_FWD_1(inputBuf, startPos, fInputLength);
945        }
946    } else {
947        if (fLastMatchEnd >= 0) {
948            // A previous find() failed to match.  Don't try again.
949            //   (without this test, a pattern with a zero-length match
950            //    could match again at the end of an input string.)
951            fHitEnd = TRUE;
952            return FALSE;
953        }
954    }
955
956
957    // Compute the position in the input string beyond which a match can not begin, because
958    //   the minimum length match would extend past the end of the input.
959    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
960    //          Be aware of possible overflows if making changes here.
961    //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
962    int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
963    if (startPos > testLen) {
964        fMatch = FALSE;
965        fHitEnd = TRUE;
966        return FALSE;
967    }
968
969    UChar32  c;
970    U_ASSERT(startPos >= 0);
971
972    switch (fPattern->fStartType) {
973    case START_NO_INFO:
974        // No optimization was found.
975        //  Try a match at each input position.
976        for (;;) {
977            MatchChunkAt(startPos, FALSE, status);
978            if (U_FAILURE(status)) {
979                return FALSE;
980            }
981            if (fMatch) {
982                return TRUE;
983            }
984            if (startPos >= testLen) {
985                fHitEnd = TRUE;
986                return FALSE;
987            }
988            U16_FWD_1(inputBuf, startPos, fActiveLimit);
989            // Note that it's perfectly OK for a pattern to have a zero-length
990            //   match at the end of a string, so we must make sure that the loop
991            //   runs with startPos == testLen the last time through.
992            if  (findProgressInterrupt(startPos, status))
993                return FALSE;
994        }
995        U_ASSERT(FALSE);
996
997    case START_START:
998        // Matches are only possible at the start of the input string
999        //   (pattern begins with ^ or \A)
1000        if (startPos > fActiveStart) {
1001            fMatch = FALSE;
1002            return FALSE;
1003        }
1004        MatchChunkAt(startPos, FALSE, status);
1005        if (U_FAILURE(status)) {
1006            return FALSE;
1007        }
1008        return fMatch;
1009
1010
1011    case START_SET:
1012    {
1013        // Match may start on any char from a pre-computed set.
1014        U_ASSERT(fPattern->fMinMatchLen > 0);
1015        for (;;) {
1016            int32_t pos = startPos;
1017            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1018            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1019                (c>=256 && fPattern->fInitialChars->contains(c))) {
1020                MatchChunkAt(pos, FALSE, status);
1021                if (U_FAILURE(status)) {
1022                    return FALSE;
1023                }
1024                if (fMatch) {
1025                    return TRUE;
1026                }
1027            }
1028            if (startPos > testLen) {
1029                fMatch = FALSE;
1030                fHitEnd = TRUE;
1031                return FALSE;
1032            }
1033            if  (findProgressInterrupt(startPos, status))
1034                return FALSE;
1035        }
1036    }
1037        U_ASSERT(FALSE);
1038
1039    case START_STRING:
1040    case START_CHAR:
1041    {
1042        // Match starts on exactly one char.
1043        U_ASSERT(fPattern->fMinMatchLen > 0);
1044        UChar32 theChar = fPattern->fInitialChar;
1045        for (;;) {
1046            int32_t pos = startPos;
1047            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1048            if (c == theChar) {
1049                MatchChunkAt(pos, FALSE, status);
1050                if (U_FAILURE(status)) {
1051                    return FALSE;
1052                }
1053                if (fMatch) {
1054                    return TRUE;
1055                }
1056            }
1057            if (startPos > testLen) {
1058                fMatch = FALSE;
1059                fHitEnd = TRUE;
1060                return FALSE;
1061            }
1062            if  (findProgressInterrupt(startPos, status))
1063                return FALSE;
1064        }
1065    }
1066    U_ASSERT(FALSE);
1067
1068    case START_LINE:
1069    {
1070        UChar32  c;
1071        if (startPos == fAnchorStart) {
1072            MatchChunkAt(startPos, FALSE, status);
1073            if (U_FAILURE(status)) {
1074                return FALSE;
1075            }
1076            if (fMatch) {
1077                return TRUE;
1078            }
1079            U16_FWD_1(inputBuf, startPos, fActiveLimit);
1080        }
1081
1082        if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1083            for (;;) {
1084                c = inputBuf[startPos-1];
1085                if (c == 0x0a) {
1086                    MatchChunkAt(startPos, FALSE, status);
1087                    if (U_FAILURE(status)) {
1088                        return FALSE;
1089                    }
1090                    if (fMatch) {
1091                        return TRUE;
1092                    }
1093                }
1094                if (startPos >= testLen) {
1095                    fMatch = FALSE;
1096                    fHitEnd = TRUE;
1097                    return FALSE;
1098                }
1099                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1100                // Note that it's perfectly OK for a pattern to have a zero-length
1101                //   match at the end of a string, so we must make sure that the loop
1102                //   runs with startPos == testLen the last time through.
1103                if  (findProgressInterrupt(startPos, status))
1104                    return FALSE;
1105            }
1106        } else {
1107            for (;;) {
1108                c = inputBuf[startPos-1];
1109                if (isLineTerminator(c)) {
1110                    if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1111                        startPos++;
1112                    }
1113                    MatchChunkAt(startPos, FALSE, status);
1114                    if (U_FAILURE(status)) {
1115                        return FALSE;
1116                    }
1117                    if (fMatch) {
1118                        return TRUE;
1119                    }
1120                }
1121                if (startPos >= testLen) {
1122                    fMatch = FALSE;
1123                    fHitEnd = TRUE;
1124                    return FALSE;
1125                }
1126                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1127                // Note that it's perfectly OK for a pattern to have a zero-length
1128                //   match at the end of a string, so we must make sure that the loop
1129                //   runs with startPos == testLen the last time through.
1130                if  (findProgressInterrupt(startPos, status))
1131                    return FALSE;
1132            }
1133        }
1134    }
1135
1136    default:
1137        U_ASSERT(FALSE);
1138    }
1139
1140    U_ASSERT(FALSE);
1141    return FALSE;
1142}
1143
1144
1145
1146//--------------------------------------------------------------------------------
1147//
1148//  group()
1149//
1150//--------------------------------------------------------------------------------
1151UnicodeString RegexMatcher::group(UErrorCode &status) const {
1152    return group(0, status);
1153}
1154
1155//  Return immutable shallow clone
1156UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1157    return group(0, dest, group_len, status);
1158}
1159
1160//  Return immutable shallow clone
1161UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1162    group_len = 0;
1163    if (U_FAILURE(status)) {
1164        return dest;
1165    }
1166    if (U_FAILURE(fDeferredStatus)) {
1167        status = fDeferredStatus;
1168    } else if (fMatch == FALSE) {
1169        status = U_REGEX_INVALID_STATE;
1170    } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1171        status = U_INDEX_OUTOFBOUNDS_ERROR;
1172    }
1173
1174    if (U_FAILURE(status)) {
1175        return dest;
1176    }
1177
1178    int64_t s, e;
1179    if (groupNum == 0) {
1180        s = fMatchStart;
1181        e = fMatchEnd;
1182    } else {
1183        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1184        U_ASSERT(groupOffset < fPattern->fFrameSize);
1185        U_ASSERT(groupOffset >= 0);
1186        s = fFrame->fExtra[groupOffset];
1187        e = fFrame->fExtra[groupOffset+1];
1188    }
1189
1190    if (s < 0) {
1191        // A capture group wasn't part of the match
1192        return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1193    }
1194    U_ASSERT(s <= e);
1195    group_len = e - s;
1196
1197    dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1198    if (dest)
1199        UTEXT_SETNATIVEINDEX(dest, s);
1200    return dest;
1201}
1202
1203UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1204    UnicodeString result;
1205    int64_t groupStart = start64(groupNum, status);
1206    int64_t groupEnd = end64(groupNum, status);
1207    if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1208        return result;
1209    }
1210
1211    // Get the group length using a utext_extract preflight.
1212    //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
1213    int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1214    if (status != U_BUFFER_OVERFLOW_ERROR) {
1215        return result;
1216    }
1217
1218    status = U_ZERO_ERROR;
1219    UChar *buf = result.getBuffer(length);
1220    if (buf == NULL) {
1221        status = U_MEMORY_ALLOCATION_ERROR;
1222    } else {
1223        int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1224        result.releaseBuffer(extractLength);
1225        U_ASSERT(length == extractLength);
1226    }
1227    return result;
1228}
1229
1230
1231//--------------------------------------------------------------------------------
1232//
1233//  appendGroup() -- currently internal only, appends a group to a UText rather
1234//                   than replacing its contents
1235//
1236//--------------------------------------------------------------------------------
1237
1238int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1239    if (U_FAILURE(status)) {
1240        return 0;
1241    }
1242    if (U_FAILURE(fDeferredStatus)) {
1243        status = fDeferredStatus;
1244        return 0;
1245    }
1246    int64_t destLen = utext_nativeLength(dest);
1247
1248    if (fMatch == FALSE) {
1249        status = U_REGEX_INVALID_STATE;
1250        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1251    }
1252    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1253        status = U_INDEX_OUTOFBOUNDS_ERROR;
1254        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1255    }
1256
1257    int64_t s, e;
1258    if (groupNum == 0) {
1259        s = fMatchStart;
1260        e = fMatchEnd;
1261    } else {
1262        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1263        U_ASSERT(groupOffset < fPattern->fFrameSize);
1264        U_ASSERT(groupOffset >= 0);
1265        s = fFrame->fExtra[groupOffset];
1266        e = fFrame->fExtra[groupOffset+1];
1267    }
1268
1269    if (s < 0) {
1270        // A capture group wasn't part of the match
1271        return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1272    }
1273    U_ASSERT(s <= e);
1274
1275    int64_t deltaLen;
1276    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1277        U_ASSERT(e <= fInputLength);
1278        deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1279    } else {
1280        int32_t len16;
1281        if (UTEXT_USES_U16(fInputText)) {
1282            len16 = (int32_t)(e-s);
1283        } else {
1284            UErrorCode lengthStatus = U_ZERO_ERROR;
1285            len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1286        }
1287        UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1288        if (groupChars == NULL) {
1289            status = U_MEMORY_ALLOCATION_ERROR;
1290            return 0;
1291        }
1292        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1293
1294        deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1295        uprv_free(groupChars);
1296    }
1297    return deltaLen;
1298}
1299
1300
1301
1302//--------------------------------------------------------------------------------
1303//
1304//  groupCount()
1305//
1306//--------------------------------------------------------------------------------
1307int32_t RegexMatcher::groupCount() const {
1308    return fPattern->fGroupMap->size();
1309}
1310
1311//--------------------------------------------------------------------------------
1312//
1313//  hasAnchoringBounds()
1314//
1315//--------------------------------------------------------------------------------
1316UBool RegexMatcher::hasAnchoringBounds() const {
1317    return fAnchoringBounds;
1318}
1319
1320
1321//--------------------------------------------------------------------------------
1322//
1323//  hasTransparentBounds()
1324//
1325//--------------------------------------------------------------------------------
1326UBool RegexMatcher::hasTransparentBounds() const {
1327    return fTransparentBounds;
1328}
1329
1330
1331
1332//--------------------------------------------------------------------------------
1333//
1334//  hitEnd()
1335//
1336//--------------------------------------------------------------------------------
1337UBool RegexMatcher::hitEnd() const {
1338    return fHitEnd;
1339}
1340
1341
1342//--------------------------------------------------------------------------------
1343//
1344//  input()
1345//
1346//--------------------------------------------------------------------------------
1347const UnicodeString &RegexMatcher::input() const {
1348    if (!fInput) {
1349        UErrorCode status = U_ZERO_ERROR;
1350        int32_t len16;
1351        if (UTEXT_USES_U16(fInputText)) {
1352            len16 = (int32_t)fInputLength;
1353        } else {
1354            len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1355            status = U_ZERO_ERROR; // overflow, length status
1356        }
1357        UnicodeString *result = new UnicodeString(len16, 0, 0);
1358
1359        UChar *inputChars = result->getBuffer(len16);
1360        utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1361        result->releaseBuffer(len16);
1362
1363        (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1364    }
1365
1366    return *fInput;
1367}
1368
1369//--------------------------------------------------------------------------------
1370//
1371//  inputText()
1372//
1373//--------------------------------------------------------------------------------
1374UText *RegexMatcher::inputText() const {
1375    return fInputText;
1376}
1377
1378
1379//--------------------------------------------------------------------------------
1380//
1381//  getInput() -- like inputText(), but makes a clone or copies into another UText
1382//
1383//--------------------------------------------------------------------------------
1384UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1385    if (U_FAILURE(status)) {
1386        return dest;
1387    }
1388    if (U_FAILURE(fDeferredStatus)) {
1389        status = fDeferredStatus;
1390        return dest;
1391    }
1392
1393    if (dest) {
1394        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1395            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1396        } else {
1397            int32_t input16Len;
1398            if (UTEXT_USES_U16(fInputText)) {
1399                input16Len = (int32_t)fInputLength;
1400            } else {
1401                UErrorCode lengthStatus = U_ZERO_ERROR;
1402                input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1403            }
1404            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1405            if (inputChars == NULL) {
1406                return dest;
1407            }
1408
1409            status = U_ZERO_ERROR;
1410            utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1411            status = U_ZERO_ERROR;
1412            utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1413
1414            uprv_free(inputChars);
1415        }
1416        return dest;
1417    } else {
1418        return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1419    }
1420}
1421
1422
1423static UBool compat_SyncMutableUTextContents(UText *ut);
1424static UBool compat_SyncMutableUTextContents(UText *ut) {
1425    UBool retVal = FALSE;
1426
1427    //  In the following test, we're really only interested in whether the UText should switch
1428    //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1429    //  will still point to the correct data.
1430    if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1431        UnicodeString *us=(UnicodeString *)ut->context;
1432
1433        // Update to the latest length.
1434        // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1435        int32_t newLength = us->length();
1436
1437        // Update the chunk description.
1438        // The buffer may have switched between stack- and heap-based.
1439        ut->chunkContents    = us->getBuffer();
1440        ut->chunkLength      = newLength;
1441        ut->chunkNativeLimit = newLength;
1442        ut->nativeIndexingLimit = newLength;
1443        retVal = TRUE;
1444    }
1445
1446    return retVal;
1447}
1448
1449//--------------------------------------------------------------------------------
1450//
1451//  lookingAt()
1452//
1453//--------------------------------------------------------------------------------
1454UBool RegexMatcher::lookingAt(UErrorCode &status) {
1455    if (U_FAILURE(status)) {
1456        return FALSE;
1457    }
1458    if (U_FAILURE(fDeferredStatus)) {
1459        status = fDeferredStatus;
1460        return FALSE;
1461    }
1462
1463    if (fInputUniStrMaybeMutable) {
1464        if (compat_SyncMutableUTextContents(fInputText)) {
1465        fInputLength = utext_nativeLength(fInputText);
1466        reset();
1467        }
1468    }
1469    else {
1470        resetPreserveRegion();
1471    }
1472    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1473        MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1474    } else {
1475        MatchAt(fActiveStart, FALSE, status);
1476    }
1477    return fMatch;
1478}
1479
1480
1481UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1482    if (U_FAILURE(status)) {
1483        return FALSE;
1484    }
1485    if (U_FAILURE(fDeferredStatus)) {
1486        status = fDeferredStatus;
1487        return FALSE;
1488    }
1489    reset();
1490
1491    if (start < 0) {
1492        status = U_INDEX_OUTOFBOUNDS_ERROR;
1493        return FALSE;
1494    }
1495
1496    if (fInputUniStrMaybeMutable) {
1497        if (compat_SyncMutableUTextContents(fInputText)) {
1498        fInputLength = utext_nativeLength(fInputText);
1499        reset();
1500        }
1501    }
1502
1503    int64_t nativeStart;
1504    nativeStart = start;
1505    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1506        status = U_INDEX_OUTOFBOUNDS_ERROR;
1507        return FALSE;
1508    }
1509
1510    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1511        MatchChunkAt((int32_t)nativeStart, FALSE, status);
1512    } else {
1513        MatchAt(nativeStart, FALSE, status);
1514    }
1515    return fMatch;
1516}
1517
1518
1519
1520//--------------------------------------------------------------------------------
1521//
1522//  matches()
1523//
1524//--------------------------------------------------------------------------------
1525UBool RegexMatcher::matches(UErrorCode &status) {
1526    if (U_FAILURE(status)) {
1527        return FALSE;
1528    }
1529    if (U_FAILURE(fDeferredStatus)) {
1530        status = fDeferredStatus;
1531        return FALSE;
1532    }
1533
1534    if (fInputUniStrMaybeMutable) {
1535        if (compat_SyncMutableUTextContents(fInputText)) {
1536        fInputLength = utext_nativeLength(fInputText);
1537        reset();
1538        }
1539    }
1540    else {
1541        resetPreserveRegion();
1542    }
1543
1544    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1545        MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1546    } else {
1547        MatchAt(fActiveStart, TRUE, status);
1548    }
1549    return fMatch;
1550}
1551
1552
1553UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1554    if (U_FAILURE(status)) {
1555        return FALSE;
1556    }
1557    if (U_FAILURE(fDeferredStatus)) {
1558        status = fDeferredStatus;
1559        return FALSE;
1560    }
1561    reset();
1562
1563    if (start < 0) {
1564        status = U_INDEX_OUTOFBOUNDS_ERROR;
1565        return FALSE;
1566    }
1567
1568    if (fInputUniStrMaybeMutable) {
1569        if (compat_SyncMutableUTextContents(fInputText)) {
1570        fInputLength = utext_nativeLength(fInputText);
1571        reset();
1572        }
1573    }
1574
1575    int64_t nativeStart;
1576    nativeStart = start;
1577    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1578        status = U_INDEX_OUTOFBOUNDS_ERROR;
1579        return FALSE;
1580    }
1581
1582    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1583        MatchChunkAt((int32_t)nativeStart, TRUE, status);
1584    } else {
1585        MatchAt(nativeStart, TRUE, status);
1586    }
1587    return fMatch;
1588}
1589
1590
1591
1592//--------------------------------------------------------------------------------
1593//
1594//    pattern
1595//
1596//--------------------------------------------------------------------------------
1597const RegexPattern &RegexMatcher::pattern() const {
1598    return *fPattern;
1599}
1600
1601
1602
1603//--------------------------------------------------------------------------------
1604//
1605//    region
1606//
1607//--------------------------------------------------------------------------------
1608RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1609    if (U_FAILURE(status)) {
1610        return *this;
1611    }
1612
1613    if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1614        status = U_ILLEGAL_ARGUMENT_ERROR;
1615    }
1616
1617    int64_t nativeStart = regionStart;
1618    int64_t nativeLimit = regionLimit;
1619    if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1620      status = U_ILLEGAL_ARGUMENT_ERROR;
1621    }
1622
1623    if (startIndex == -1)
1624      this->reset();
1625    else
1626      resetPreserveRegion();
1627
1628    fRegionStart = nativeStart;
1629    fRegionLimit = nativeLimit;
1630    fActiveStart = nativeStart;
1631    fActiveLimit = nativeLimit;
1632
1633    if (startIndex != -1) {
1634      if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1635          status = U_INDEX_OUTOFBOUNDS_ERROR;
1636      }
1637      fMatchEnd = startIndex;
1638    }
1639
1640    if (!fTransparentBounds) {
1641        fLookStart = nativeStart;
1642        fLookLimit = nativeLimit;
1643    }
1644    if (fAnchoringBounds) {
1645        fAnchorStart = nativeStart;
1646        fAnchorLimit = nativeLimit;
1647    }
1648    return *this;
1649}
1650
1651RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1652  return region(start, limit, -1, status);
1653}
1654
1655//--------------------------------------------------------------------------------
1656//
1657//    regionEnd
1658//
1659//--------------------------------------------------------------------------------
1660int32_t RegexMatcher::regionEnd() const {
1661    return (int32_t)fRegionLimit;
1662}
1663
1664int64_t RegexMatcher::regionEnd64() const {
1665    return fRegionLimit;
1666}
1667
1668//--------------------------------------------------------------------------------
1669//
1670//    regionStart
1671//
1672//--------------------------------------------------------------------------------
1673int32_t RegexMatcher::regionStart() const {
1674    return (int32_t)fRegionStart;
1675}
1676
1677int64_t RegexMatcher::regionStart64() const {
1678    return fRegionStart;
1679}
1680
1681
1682//--------------------------------------------------------------------------------
1683//
1684//    replaceAll
1685//
1686//--------------------------------------------------------------------------------
1687UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1688    UText replacementText = UTEXT_INITIALIZER;
1689    UText resultText = UTEXT_INITIALIZER;
1690    UnicodeString resultString;
1691    if (U_FAILURE(status)) {
1692        return resultString;
1693    }
1694
1695    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1696    utext_openUnicodeString(&resultText, &resultString, &status);
1697
1698    replaceAll(&replacementText, &resultText, status);
1699
1700    utext_close(&resultText);
1701    utext_close(&replacementText);
1702
1703    return resultString;
1704}
1705
1706
1707//
1708//    replaceAll, UText mode
1709//
1710UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1711    if (U_FAILURE(status)) {
1712        return dest;
1713    }
1714    if (U_FAILURE(fDeferredStatus)) {
1715        status = fDeferredStatus;
1716        return dest;
1717    }
1718
1719    if (dest == NULL) {
1720        UnicodeString emptyString;
1721        UText empty = UTEXT_INITIALIZER;
1722
1723        utext_openUnicodeString(&empty, &emptyString, &status);
1724        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1725        utext_close(&empty);
1726    }
1727
1728    if (U_SUCCESS(status)) {
1729        reset();
1730        while (find()) {
1731            appendReplacement(dest, replacement, status);
1732            if (U_FAILURE(status)) {
1733                break;
1734            }
1735        }
1736        appendTail(dest, status);
1737    }
1738
1739    return dest;
1740}
1741
1742
1743//--------------------------------------------------------------------------------
1744//
1745//    replaceFirst
1746//
1747//--------------------------------------------------------------------------------
1748UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1749    UText replacementText = UTEXT_INITIALIZER;
1750    UText resultText = UTEXT_INITIALIZER;
1751    UnicodeString resultString;
1752
1753    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1754    utext_openUnicodeString(&resultText, &resultString, &status);
1755
1756    replaceFirst(&replacementText, &resultText, status);
1757
1758    utext_close(&resultText);
1759    utext_close(&replacementText);
1760
1761    return resultString;
1762}
1763
1764//
1765//    replaceFirst, UText mode
1766//
1767UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1768    if (U_FAILURE(status)) {
1769        return dest;
1770    }
1771    if (U_FAILURE(fDeferredStatus)) {
1772        status = fDeferredStatus;
1773        return dest;
1774    }
1775
1776    reset();
1777    if (!find()) {
1778        return getInput(dest, status);
1779    }
1780
1781    if (dest == NULL) {
1782        UnicodeString emptyString;
1783        UText empty = UTEXT_INITIALIZER;
1784
1785        utext_openUnicodeString(&empty, &emptyString, &status);
1786        dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1787        utext_close(&empty);
1788    }
1789
1790    appendReplacement(dest, replacement, status);
1791    appendTail(dest, status);
1792
1793    return dest;
1794}
1795
1796
1797//--------------------------------------------------------------------------------
1798//
1799//     requireEnd
1800//
1801//--------------------------------------------------------------------------------
1802UBool RegexMatcher::requireEnd() const {
1803    return fRequireEnd;
1804}
1805
1806
1807//--------------------------------------------------------------------------------
1808//
1809//     reset
1810//
1811//--------------------------------------------------------------------------------
1812RegexMatcher &RegexMatcher::reset() {
1813    fRegionStart    = 0;
1814    fRegionLimit    = fInputLength;
1815    fActiveStart    = 0;
1816    fActiveLimit    = fInputLength;
1817    fAnchorStart    = 0;
1818    fAnchorLimit    = fInputLength;
1819    fLookStart      = 0;
1820    fLookLimit      = fInputLength;
1821    resetPreserveRegion();
1822    return *this;
1823}
1824
1825
1826
1827void RegexMatcher::resetPreserveRegion() {
1828    fMatchStart     = 0;
1829    fMatchEnd       = 0;
1830    fLastMatchEnd   = -1;
1831    fAppendPosition = 0;
1832    fMatch          = FALSE;
1833    fHitEnd         = FALSE;
1834    fRequireEnd     = FALSE;
1835    fTime           = 0;
1836    fTickCounter    = TIMER_INITIAL_VALUE;
1837    //resetStack(); // more expensive than it looks...
1838}
1839
1840
1841RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1842    fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1843    if (fPattern->fNeedsAltInput) {
1844        fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1845    }
1846    if (U_FAILURE(fDeferredStatus)) {
1847        return *this;
1848    }
1849    fInputLength = utext_nativeLength(fInputText);
1850
1851    reset();
1852    delete fInput;
1853    fInput = NULL;
1854
1855    //  Do the following for any UnicodeString.
1856    //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1857    fInputUniStrMaybeMutable = TRUE;
1858
1859    if (fWordBreakItr != NULL) {
1860#if UCONFIG_NO_BREAK_ITERATION==0
1861        UErrorCode status = U_ZERO_ERROR;
1862        fWordBreakItr->setText(fInputText, status);
1863#endif
1864    }
1865    return *this;
1866}
1867
1868
1869RegexMatcher &RegexMatcher::reset(UText *input) {
1870    if (fInputText != input) {
1871        fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1872        if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1873        if (U_FAILURE(fDeferredStatus)) {
1874            return *this;
1875        }
1876        fInputLength = utext_nativeLength(fInputText);
1877
1878        delete fInput;
1879        fInput = NULL;
1880
1881        if (fWordBreakItr != NULL) {
1882#if UCONFIG_NO_BREAK_ITERATION==0
1883            UErrorCode status = U_ZERO_ERROR;
1884            fWordBreakItr->setText(input, status);
1885#endif
1886        }
1887    }
1888    reset();
1889    fInputUniStrMaybeMutable = FALSE;
1890
1891    return *this;
1892}
1893
1894/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1895    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1896    return *this;
1897}*/
1898
1899RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1900    if (U_FAILURE(status)) {
1901        return *this;
1902    }
1903    reset();       // Reset also resets the region to be the entire string.
1904
1905    if (position < 0 || position > fActiveLimit) {
1906        status = U_INDEX_OUTOFBOUNDS_ERROR;
1907        return *this;
1908    }
1909    fMatchEnd = position;
1910    return *this;
1911}
1912
1913
1914//--------------------------------------------------------------------------------
1915//
1916//    refresh
1917//
1918//--------------------------------------------------------------------------------
1919RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1920    if (U_FAILURE(status)) {
1921        return *this;
1922    }
1923    if (input == NULL) {
1924        status = U_ILLEGAL_ARGUMENT_ERROR;
1925        return *this;
1926    }
1927    if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1928        status = U_ILLEGAL_ARGUMENT_ERROR;
1929        return *this;
1930    }
1931    int64_t  pos = utext_getNativeIndex(fInputText);
1932    //  Shallow read-only clone of the new UText into the existing input UText
1933    fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1934    if (U_FAILURE(status)) {
1935        return *this;
1936    }
1937    utext_setNativeIndex(fInputText, pos);
1938
1939    if (fAltInputText != NULL) {
1940        pos = utext_getNativeIndex(fAltInputText);
1941        fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1942        if (U_FAILURE(status)) {
1943            return *this;
1944        }
1945        utext_setNativeIndex(fAltInputText, pos);
1946    }
1947    return *this;
1948}
1949
1950
1951
1952//--------------------------------------------------------------------------------
1953//
1954//    setTrace
1955//
1956//--------------------------------------------------------------------------------
1957void RegexMatcher::setTrace(UBool state) {
1958    fTraceDebug = state;
1959}
1960
1961
1962
1963/**
1964  *  UText, replace entire contents of the destination UText with a substring of the source UText.
1965  *
1966  *     @param src    The source UText
1967  *     @param dest   The destination UText. Must be writable.
1968  *                   May be NULL, in which case a new UText will be allocated.
1969  *     @param start  Start index of source substring.
1970  *     @param limit  Limit index of source substring.
1971  *     @param status An error code.
1972  */
1973static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1974    if (U_FAILURE(*status)) {
1975        return dest;
1976    }
1977    if (start == limit) {
1978        if (dest) {
1979            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1980            return dest;
1981        } else {
1982            return utext_openUChars(NULL, NULL, 0, status);
1983        }
1984    }
1985    int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1986    if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1987        return dest;
1988    }
1989    *status = U_ZERO_ERROR;
1990    MaybeStackArray<UChar, 40> buffer;
1991    if (length >= buffer.getCapacity()) {
1992        UChar *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
1993        if (newBuf == NULL) {
1994            *status = U_MEMORY_ALLOCATION_ERROR;
1995        }
1996    }
1997    utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1998    if (dest) {
1999        utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2000        return dest;
2001    }
2002
2003    // Caller did not provide a prexisting UText.
2004    // Open a new one, and have it adopt the text buffer storage.
2005    if (U_FAILURE(*status)) {
2006        return NULL;
2007    }
2008    int32_t ownedLength = 0;
2009    UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2010    if (ownedBuf == NULL) {
2011        *status = U_MEMORY_ALLOCATION_ERROR;
2012        return NULL;
2013    }
2014    UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2015    if (U_FAILURE(*status)) {
2016        uprv_free(ownedBuf);
2017        return NULL;
2018    }
2019    result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2020    return result;
2021}
2022
2023
2024//---------------------------------------------------------------------
2025//
2026//   split
2027//
2028//---------------------------------------------------------------------
2029int32_t  RegexMatcher::split(const UnicodeString &input,
2030        UnicodeString    dest[],
2031        int32_t          destCapacity,
2032        UErrorCode      &status)
2033{
2034    UText inputText = UTEXT_INITIALIZER;
2035    utext_openConstUnicodeString(&inputText, &input, &status);
2036    if (U_FAILURE(status)) {
2037        return 0;
2038    }
2039
2040    UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2041    if (destText == NULL) {
2042        status = U_MEMORY_ALLOCATION_ERROR;
2043        return 0;
2044    }
2045    int32_t i;
2046    for (i = 0; i < destCapacity; i++) {
2047        destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2048    }
2049
2050    int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2051
2052    for (i = 0; i < destCapacity; i++) {
2053        utext_close(destText[i]);
2054    }
2055
2056    uprv_free(destText);
2057    utext_close(&inputText);
2058    return fieldCount;
2059}
2060
2061//
2062//   split, UText mode
2063//
2064int32_t  RegexMatcher::split(UText *input,
2065        UText           *dest[],
2066        int32_t          destCapacity,
2067        UErrorCode      &status)
2068{
2069    //
2070    // Check arguements for validity
2071    //
2072    if (U_FAILURE(status)) {
2073        return 0;
2074    };
2075
2076    if (destCapacity < 1) {
2077        status = U_ILLEGAL_ARGUMENT_ERROR;
2078        return 0;
2079    }
2080
2081    //
2082    // Reset for the input text
2083    //
2084    reset(input);
2085    int64_t   nextOutputStringStart = 0;
2086    if (fActiveLimit == 0) {
2087        return 0;
2088    }
2089
2090    //
2091    // Loop through the input text, searching for the delimiter pattern
2092    //
2093    int32_t i;
2094    int32_t numCaptureGroups = fPattern->fGroupMap->size();
2095    for (i=0; ; i++) {
2096        if (i>=destCapacity-1) {
2097            // There is one or zero output string left.
2098            // Fill the last output string with whatever is left from the input, then exit the loop.
2099            //  ( i will be == destCapacity if we filled the output array while processing
2100            //    capture groups of the delimiter expression, in which case we will discard the
2101            //    last capture group saved in favor of the unprocessed remainder of the
2102            //    input string.)
2103            i = destCapacity-1;
2104            if (fActiveLimit > nextOutputStringStart) {
2105                if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2106                    if (dest[i]) {
2107                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2108                                      input->chunkContents+nextOutputStringStart,
2109                                      (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2110                    } else {
2111                        UText remainingText = UTEXT_INITIALIZER;
2112                        utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2113                                         fActiveLimit-nextOutputStringStart, &status);
2114                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2115                        utext_close(&remainingText);
2116                    }
2117                } else {
2118                    UErrorCode lengthStatus = U_ZERO_ERROR;
2119                    int32_t remaining16Length =
2120                        utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2121                    UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2122                    if (remainingChars == NULL) {
2123                        status = U_MEMORY_ALLOCATION_ERROR;
2124                        break;
2125                    }
2126
2127                    utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2128                    if (dest[i]) {
2129                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2130                    } else {
2131                        UText remainingText = UTEXT_INITIALIZER;
2132                        utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2133                        dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2134                        utext_close(&remainingText);
2135                    }
2136
2137                    uprv_free(remainingChars);
2138                }
2139            }
2140            break;
2141        }
2142        if (find()) {
2143            // We found another delimiter.  Move everything from where we started looking
2144            //  up until the start of the delimiter into the next output string.
2145            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2146                if (dest[i]) {
2147                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2148                                  input->chunkContents+nextOutputStringStart,
2149                                  (int32_t)(fMatchStart-nextOutputStringStart), &status);
2150                } else {
2151                    UText remainingText = UTEXT_INITIALIZER;
2152                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2153                                      fMatchStart-nextOutputStringStart, &status);
2154                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2155                    utext_close(&remainingText);
2156                }
2157            } else {
2158                UErrorCode lengthStatus = U_ZERO_ERROR;
2159                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2160                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2161                if (remainingChars == NULL) {
2162                    status = U_MEMORY_ALLOCATION_ERROR;
2163                    break;
2164                }
2165                utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2166                if (dest[i]) {
2167                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2168                } else {
2169                    UText remainingText = UTEXT_INITIALIZER;
2170                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2171                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2172                    utext_close(&remainingText);
2173                }
2174
2175                uprv_free(remainingChars);
2176            }
2177            nextOutputStringStart = fMatchEnd;
2178
2179            // If the delimiter pattern has capturing parentheses, the captured
2180            //  text goes out into the next n destination strings.
2181            int32_t groupNum;
2182            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2183                if (i >= destCapacity-2) {
2184                    // Never fill the last available output string with capture group text.
2185                    // It will filled with the last field, the remainder of the
2186                    //  unsplit input text.
2187                    break;
2188                }
2189                i++;
2190                dest[i] = utext_extract_replace(fInputText, dest[i],
2191                                               start64(groupNum, status), end64(groupNum, status), &status);
2192            }
2193
2194            if (nextOutputStringStart == fActiveLimit) {
2195                // The delimiter was at the end of the string.  We're done, but first
2196                // we output one last empty string, for the empty field following
2197                //   the delimiter at the end of input.
2198                if (i+1 < destCapacity) {
2199                    ++i;
2200                    if (dest[i] == NULL) {
2201                        dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2202                    } else {
2203                        static UChar emptyString[] = {(UChar)0};
2204                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2205                    }
2206                }
2207                break;
2208
2209            }
2210        }
2211        else
2212        {
2213            // We ran off the end of the input while looking for the next delimiter.
2214            // All the remaining text goes into the current output string.
2215            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2216                if (dest[i]) {
2217                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2218                                  input->chunkContents+nextOutputStringStart,
2219                                  (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2220                } else {
2221                    UText remainingText = UTEXT_INITIALIZER;
2222                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2223                                     fActiveLimit-nextOutputStringStart, &status);
2224                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2225                    utext_close(&remainingText);
2226                }
2227            } else {
2228                UErrorCode lengthStatus = U_ZERO_ERROR;
2229                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2230                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2231                if (remainingChars == NULL) {
2232                    status = U_MEMORY_ALLOCATION_ERROR;
2233                    break;
2234                }
2235
2236                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2237                if (dest[i]) {
2238                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2239                } else {
2240                    UText remainingText = UTEXT_INITIALIZER;
2241                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2242                    dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2243                    utext_close(&remainingText);
2244                }
2245
2246                uprv_free(remainingChars);
2247            }
2248            break;
2249        }
2250        if (U_FAILURE(status)) {
2251            break;
2252        }
2253    }   // end of for loop
2254    return i+1;
2255}
2256
2257
2258//--------------------------------------------------------------------------------
2259//
2260//     start
2261//
2262//--------------------------------------------------------------------------------
2263int32_t RegexMatcher::start(UErrorCode &status) const {
2264    return start(0, status);
2265}
2266
2267int64_t RegexMatcher::start64(UErrorCode &status) const {
2268    return start64(0, status);
2269}
2270
2271//--------------------------------------------------------------------------------
2272//
2273//     start(int32_t group, UErrorCode &status)
2274//
2275//--------------------------------------------------------------------------------
2276
2277int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2278    if (U_FAILURE(status)) {
2279        return -1;
2280    }
2281    if (U_FAILURE(fDeferredStatus)) {
2282        status = fDeferredStatus;
2283        return -1;
2284    }
2285    if (fMatch == FALSE) {
2286        status = U_REGEX_INVALID_STATE;
2287        return -1;
2288    }
2289    if (group < 0 || group > fPattern->fGroupMap->size()) {
2290        status = U_INDEX_OUTOFBOUNDS_ERROR;
2291        return -1;
2292    }
2293    int64_t s;
2294    if (group == 0) {
2295        s = fMatchStart;
2296    } else {
2297        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2298        U_ASSERT(groupOffset < fPattern->fFrameSize);
2299        U_ASSERT(groupOffset >= 0);
2300        s = fFrame->fExtra[groupOffset];
2301    }
2302
2303    return s;
2304}
2305
2306
2307int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2308    return (int32_t)start64(group, status);
2309}
2310
2311//--------------------------------------------------------------------------------
2312//
2313//     useAnchoringBounds
2314//
2315//--------------------------------------------------------------------------------
2316RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2317    fAnchoringBounds = b;
2318    fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2319    fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2320    return *this;
2321}
2322
2323
2324//--------------------------------------------------------------------------------
2325//
2326//     useTransparentBounds
2327//
2328//--------------------------------------------------------------------------------
2329RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2330    fTransparentBounds = b;
2331    fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2332    fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2333    return *this;
2334}
2335
2336//--------------------------------------------------------------------------------
2337//
2338//     setTimeLimit
2339//
2340//--------------------------------------------------------------------------------
2341void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2342    if (U_FAILURE(status)) {
2343        return;
2344    }
2345    if (U_FAILURE(fDeferredStatus)) {
2346        status = fDeferredStatus;
2347        return;
2348    }
2349    if (limit < 0) {
2350        status = U_ILLEGAL_ARGUMENT_ERROR;
2351        return;
2352    }
2353    fTimeLimit = limit;
2354}
2355
2356
2357//--------------------------------------------------------------------------------
2358//
2359//     getTimeLimit
2360//
2361//--------------------------------------------------------------------------------
2362int32_t RegexMatcher::getTimeLimit() const {
2363    return fTimeLimit;
2364}
2365
2366
2367//--------------------------------------------------------------------------------
2368//
2369//     setStackLimit
2370//
2371//--------------------------------------------------------------------------------
2372void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2373    if (U_FAILURE(status)) {
2374        return;
2375    }
2376    if (U_FAILURE(fDeferredStatus)) {
2377        status = fDeferredStatus;
2378        return;
2379    }
2380    if (limit < 0) {
2381        status = U_ILLEGAL_ARGUMENT_ERROR;
2382        return;
2383    }
2384
2385    // Reset the matcher.  This is needed here in case there is a current match
2386    //    whose final stack frame (containing the match results, pointed to by fFrame)
2387    //    would be lost by resizing to a smaller stack size.
2388    reset();
2389
2390    if (limit == 0) {
2391        // Unlimited stack expansion
2392        fStack->setMaxCapacity(0);
2393    } else {
2394        // Change the units of the limit  from bytes to ints, and bump the size up
2395        //   to be big enough to hold at least one stack frame for the pattern,
2396        //   if it isn't there already.
2397        int32_t adjustedLimit = limit / sizeof(int32_t);
2398        if (adjustedLimit < fPattern->fFrameSize) {
2399            adjustedLimit = fPattern->fFrameSize;
2400        }
2401        fStack->setMaxCapacity(adjustedLimit);
2402    }
2403    fStackLimit = limit;
2404}
2405
2406
2407//--------------------------------------------------------------------------------
2408//
2409//     getStackLimit
2410//
2411//--------------------------------------------------------------------------------
2412int32_t RegexMatcher::getStackLimit() const {
2413    return fStackLimit;
2414}
2415
2416
2417//--------------------------------------------------------------------------------
2418//
2419//     setMatchCallback
2420//
2421//--------------------------------------------------------------------------------
2422void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2423                                    const void              *context,
2424                                    UErrorCode              &status) {
2425    if (U_FAILURE(status)) {
2426        return;
2427    }
2428    fCallbackFn = callback;
2429    fCallbackContext = context;
2430}
2431
2432
2433//--------------------------------------------------------------------------------
2434//
2435//     getMatchCallback
2436//
2437//--------------------------------------------------------------------------------
2438void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2439                                  const void              *&context,
2440                                  UErrorCode              &status) {
2441    if (U_FAILURE(status)) {
2442       return;
2443    }
2444    callback = fCallbackFn;
2445    context  = fCallbackContext;
2446}
2447
2448
2449//--------------------------------------------------------------------------------
2450//
2451//     setMatchCallback
2452//
2453//--------------------------------------------------------------------------------
2454void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2455                                                const void                      *context,
2456                                                UErrorCode                      &status) {
2457    if (U_FAILURE(status)) {
2458        return;
2459    }
2460    fFindProgressCallbackFn = callback;
2461    fFindProgressCallbackContext = context;
2462}
2463
2464
2465//--------------------------------------------------------------------------------
2466//
2467//     getMatchCallback
2468//
2469//--------------------------------------------------------------------------------
2470void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2471                                                const void                    *&context,
2472                                                UErrorCode                    &status) {
2473    if (U_FAILURE(status)) {
2474       return;
2475    }
2476    callback = fFindProgressCallbackFn;
2477    context  = fFindProgressCallbackContext;
2478}
2479
2480
2481//================================================================================
2482//
2483//    Code following this point in this file is the internal
2484//    Match Engine Implementation.
2485//
2486//================================================================================
2487
2488
2489//--------------------------------------------------------------------------------
2490//
2491//   resetStack
2492//           Discard any previous contents of the state save stack, and initialize a
2493//           new stack frame to all -1.  The -1s are needed for capture group limits,
2494//           where they indicate that a group has not yet matched anything.
2495//--------------------------------------------------------------------------------
2496REStackFrame *RegexMatcher::resetStack() {
2497    // Discard any previous contents of the state save stack, and initialize a
2498    //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2499    //  where they indicate that a group has not yet matched anything.
2500    fStack->removeAllElements();
2501
2502    REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2503    if(U_FAILURE(fDeferredStatus)) {
2504        return NULL;
2505    }
2506
2507    int32_t i;
2508    for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2509        iFrame->fExtra[i] = -1;
2510    }
2511    return iFrame;
2512}
2513
2514
2515
2516//--------------------------------------------------------------------------------
2517//
2518//   isWordBoundary
2519//                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2520//                     For us,
2521//                       If the current char is a combining mark,
2522//                          \b is FALSE.
2523//                       Else Scan backwards to the first non-combining char.
2524//                            We are at a boundary if the this char and the original chars are
2525//                               opposite in membership in \w set
2526//
2527//          parameters:   pos   - the current position in the input buffer
2528//
2529//              TODO:  double-check edge cases at region boundaries.
2530//
2531//--------------------------------------------------------------------------------
2532UBool RegexMatcher::isWordBoundary(int64_t pos) {
2533    UBool isBoundary = FALSE;
2534    UBool cIsWord    = FALSE;
2535
2536    if (pos >= fLookLimit) {
2537        fHitEnd = TRUE;
2538    } else {
2539        // Determine whether char c at current position is a member of the word set of chars.
2540        // If we're off the end of the string, behave as though we're not at a word char.
2541        UTEXT_SETNATIVEINDEX(fInputText, pos);
2542        UChar32  c = UTEXT_CURRENT32(fInputText);
2543        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2544            // Current char is a combining one.  Not a boundary.
2545            return FALSE;
2546        }
2547        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2548    }
2549
2550    // Back up until we come to a non-combining char, determine whether
2551    //  that char is a word char.
2552    UBool prevCIsWord = FALSE;
2553    for (;;) {
2554        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2555            break;
2556        }
2557        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2558        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2559              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2560            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2561            break;
2562        }
2563    }
2564    isBoundary = cIsWord ^ prevCIsWord;
2565    return isBoundary;
2566}
2567
2568UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2569    UBool isBoundary = FALSE;
2570    UBool cIsWord    = FALSE;
2571
2572    const UChar *inputBuf = fInputText->chunkContents;
2573
2574    if (pos >= fLookLimit) {
2575        fHitEnd = TRUE;
2576    } else {
2577        // Determine whether char c at current position is a member of the word set of chars.
2578        // If we're off the end of the string, behave as though we're not at a word char.
2579        UChar32 c;
2580        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2581        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2582            // Current char is a combining one.  Not a boundary.
2583            return FALSE;
2584        }
2585        cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2586    }
2587
2588    // Back up until we come to a non-combining char, determine whether
2589    //  that char is a word char.
2590    UBool prevCIsWord = FALSE;
2591    for (;;) {
2592        if (pos <= fLookStart) {
2593            break;
2594        }
2595        UChar32 prevChar;
2596        U16_PREV(inputBuf, fLookStart, pos, prevChar);
2597        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2598              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2599            prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2600            break;
2601        }
2602    }
2603    isBoundary = cIsWord ^ prevCIsWord;
2604    return isBoundary;
2605}
2606
2607//--------------------------------------------------------------------------------
2608//
2609//   isUWordBoundary
2610//
2611//         Test for a word boundary using RBBI word break.
2612//
2613//          parameters:   pos   - the current position in the input buffer
2614//
2615//--------------------------------------------------------------------------------
2616UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2617    UBool       returnVal = FALSE;
2618#if UCONFIG_NO_BREAK_ITERATION==0
2619
2620    // If we haven't yet created a break iterator for this matcher, do it now.
2621    if (fWordBreakItr == NULL) {
2622        fWordBreakItr =
2623            (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2624        if (U_FAILURE(fDeferredStatus)) {
2625            return FALSE;
2626        }
2627        fWordBreakItr->setText(fInputText, fDeferredStatus);
2628    }
2629
2630    if (pos >= fLookLimit) {
2631        fHitEnd = TRUE;
2632        returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2633                            //    words are not boundaries.  All non-word chars stand by themselves,
2634                            //    with word boundaries on both sides.
2635    } else {
2636        if (!UTEXT_USES_U16(fInputText)) {
2637            // !!!: Would like a better way to do this!
2638            UErrorCode status = U_ZERO_ERROR;
2639            pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2640        }
2641        returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2642    }
2643#endif
2644    return   returnVal;
2645}
2646
2647//--------------------------------------------------------------------------------
2648//
2649//   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2650//                     saves. Increment the "time" counter, and call the
2651//                     user callback function if there is one installed.
2652//
2653//                     If the match operation needs to be aborted, either for a time-out
2654//                     or because the user callback asked for it, just set an error status.
2655//                     The engine will pick that up and stop in its outer loop.
2656//
2657//--------------------------------------------------------------------------------
2658void RegexMatcher::IncrementTime(UErrorCode &status) {
2659    fTickCounter = TIMER_INITIAL_VALUE;
2660    fTime++;
2661    if (fCallbackFn != NULL) {
2662        if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2663            status = U_REGEX_STOPPED_BY_CALLER;
2664            return;
2665        }
2666    }
2667    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2668        status = U_REGEX_TIME_OUT;
2669    }
2670}
2671
2672//--------------------------------------------------------------------------------
2673//
2674//   StateSave
2675//       Make a new stack frame, initialized as a copy of the current stack frame.
2676//       Set the pattern index in the original stack frame from the operand value
2677//       in the opcode.  Execution of the engine continues with the state in
2678//       the newly created stack frame
2679//
2680//       Note that reserveBlock() may grow the stack, resulting in the
2681//       whole thing being relocated in memory.
2682//
2683//    Parameters:
2684//       fp           The top frame pointer when called.  At return, a new
2685//                    fame will be present
2686//       savePatIdx   An index into the compiled pattern.  Goes into the original
2687//                    (not new) frame.  If execution ever back-tracks out of the
2688//                    new frame, this will be where we continue from in the pattern.
2689//    Return
2690//                    The new frame pointer.
2691//
2692//--------------------------------------------------------------------------------
2693inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2694    if (U_FAILURE(status)) {
2695        return fp;
2696    }
2697    // push storage for a new frame.
2698    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2699    if (U_FAILURE(status)) {
2700        // Failure on attempted stack expansion.
2701        //   Stack function set some other error code, change it to a more
2702        //   specific one for regular expressions.
2703        status = U_REGEX_STACK_OVERFLOW;
2704        // We need to return a writable stack frame, so just return the
2705        //    previous frame.  The match operation will stop quickly
2706        //    because of the error status, after which the frame will never
2707        //    be looked at again.
2708        return fp;
2709    }
2710    fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2711
2712    // New stack frame = copy of old top frame.
2713    int64_t *source = (int64_t *)fp;
2714    int64_t *dest   = newFP;
2715    for (;;) {
2716        *dest++ = *source++;
2717        if (source == newFP) {
2718            break;
2719        }
2720    }
2721
2722    fTickCounter--;
2723    if (fTickCounter <= 0) {
2724       IncrementTime(status);    // Re-initializes fTickCounter
2725    }
2726    fp->fPatIdx = savePatIdx;
2727    return (REStackFrame *)newFP;
2728}
2729
2730#if defined(REGEX_DEBUG)
2731namespace {
2732UnicodeString StringFromUText(UText *ut) {
2733    UnicodeString result;
2734    for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2735        result.append(c);
2736    }
2737    return result;
2738}
2739}
2740#endif // REGEX_DEBUG
2741
2742
2743//--------------------------------------------------------------------------------
2744//
2745//   MatchAt      This is the actual matching engine.
2746//
2747//                  startIdx:    begin matching a this index.
2748//                  toEnd:       if true, match must extend to end of the input region
2749//
2750//--------------------------------------------------------------------------------
2751void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2752    UBool       isMatch  = FALSE;      // True if the we have a match.
2753
2754    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2755
2756    int32_t     op;                    // Operation from the compiled pattern, split into
2757    int32_t     opType;                //    the opcode
2758    int32_t     opValue;               //    and the operand value.
2759
2760#ifdef REGEX_RUN_DEBUG
2761    if (fTraceDebug) {
2762        printf("MatchAt(startIdx=%ld)\n", startIdx);
2763        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2764        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2765    }
2766#endif
2767
2768    if (U_FAILURE(status)) {
2769        return;
2770    }
2771
2772    //  Cache frequently referenced items from the compiled pattern
2773    //
2774    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2775
2776    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2777    UVector             *sets          = fPattern->fSets;
2778
2779    fFrameSize = fPattern->fFrameSize;
2780    REStackFrame        *fp            = resetStack();
2781    if (U_FAILURE(fDeferredStatus)) {
2782        status = fDeferredStatus;
2783        return;
2784    }
2785
2786    fp->fPatIdx   = 0;
2787    fp->fInputIdx = startIdx;
2788
2789    // Zero out the pattern's static data
2790    int32_t i;
2791    for (i = 0; i<fPattern->fDataSize; i++) {
2792        fData[i] = 0;
2793    }
2794
2795    //
2796    //  Main loop for interpreting the compiled pattern.
2797    //  One iteration of the loop per pattern operation performed.
2798    //
2799    for (;;) {
2800        op      = (int32_t)pat[fp->fPatIdx];
2801        opType  = URX_TYPE(op);
2802        opValue = URX_VAL(op);
2803#ifdef REGEX_RUN_DEBUG
2804        if (fTraceDebug) {
2805            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2806            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
2807                UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2808            fPattern->dumpOp(fp->fPatIdx);
2809        }
2810#endif
2811        fp->fPatIdx++;
2812
2813        switch (opType) {
2814
2815
2816        case URX_NOP:
2817            break;
2818
2819
2820        case URX_BACKTRACK:
2821            // Force a backtrack.  In some circumstances, the pattern compiler
2822            //   will notice that the pattern can't possibly match anything, and will
2823            //   emit one of these at that point.
2824            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2825            break;
2826
2827
2828        case URX_ONECHAR:
2829            if (fp->fInputIdx < fActiveLimit) {
2830                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2831                UChar32 c = UTEXT_NEXT32(fInputText);
2832                if (c == opValue) {
2833                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2834                    break;
2835                }
2836            } else {
2837                fHitEnd = TRUE;
2838            }
2839            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2840            break;
2841
2842
2843        case URX_STRING:
2844            {
2845                // Test input against a literal string.
2846                // Strings require two slots in the compiled pattern, one for the
2847                //   offset to the string text, and one for the length.
2848
2849                int32_t   stringStartIdx = opValue;
2850                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2851                fp->fPatIdx++;
2852                opType    = URX_TYPE(op);
2853                int32_t stringLen = URX_VAL(op);
2854                U_ASSERT(opType == URX_STRING_LEN);
2855                U_ASSERT(stringLen >= 2);
2856
2857                const UChar *patternString = litText+stringStartIdx;
2858                int32_t patternStringIndex = 0;
2859                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2860                UChar32 inputChar;
2861                UChar32 patternChar;
2862                UBool success = TRUE;
2863                while (patternStringIndex < stringLen) {
2864                    if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2865                        success = FALSE;
2866                        fHitEnd = TRUE;
2867                        break;
2868                    }
2869                    inputChar = UTEXT_NEXT32(fInputText);
2870                    U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2871                    if (patternChar != inputChar) {
2872                        success = FALSE;
2873                        break;
2874                    }
2875                }
2876
2877                if (success) {
2878                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2879                } else {
2880                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2881                }
2882            }
2883            break;
2884
2885
2886        case URX_STATE_SAVE:
2887            fp = StateSave(fp, opValue, status);
2888            break;
2889
2890
2891        case URX_END:
2892            // The match loop will exit via this path on a successful match,
2893            //   when we reach the end of the pattern.
2894            if (toEnd && fp->fInputIdx != fActiveLimit) {
2895                // The pattern matched, but not to the end of input.  Try some more.
2896                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2897                break;
2898            }
2899            isMatch = TRUE;
2900            goto  breakFromLoop;
2901
2902        // Start and End Capture stack frame variables are laid out out like this:
2903            //  fp->fExtra[opValue]  - The start of a completed capture group
2904            //             opValue+1 - The end   of a completed capture group
2905            //             opValue+2 - the start of a capture group whose end
2906            //                          has not yet been reached (and might not ever be).
2907        case URX_START_CAPTURE:
2908            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2909            fp->fExtra[opValue+2] = fp->fInputIdx;
2910            break;
2911
2912
2913        case URX_END_CAPTURE:
2914            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2915            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2916            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2917            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2918            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2919            break;
2920
2921
2922        case URX_DOLLAR:                   //  $, test for End of line
2923                                           //     or for position before new line at end of input
2924            {
2925                if (fp->fInputIdx >= fAnchorLimit) {
2926                    // We really are at the end of input.  Success.
2927                    fHitEnd = TRUE;
2928                    fRequireEnd = TRUE;
2929                    break;
2930                }
2931
2932                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2933
2934                // If we are positioned just before a new-line that is located at the
2935                //   end of input, succeed.
2936                UChar32 c = UTEXT_NEXT32(fInputText);
2937                if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2938                    if (isLineTerminator(c)) {
2939                        // If not in the middle of a CR/LF sequence
2940                        if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2941                            // At new-line at end of input. Success
2942                            fHitEnd = TRUE;
2943                            fRequireEnd = TRUE;
2944
2945                            break;
2946                        }
2947                    }
2948                } else {
2949                    UChar32 nextC = UTEXT_NEXT32(fInputText);
2950                    if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2951                        fHitEnd = TRUE;
2952                        fRequireEnd = TRUE;
2953                        break;                         // At CR/LF at end of input.  Success
2954                    }
2955                }
2956
2957                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2958            }
2959            break;
2960
2961
2962         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
2963            if (fp->fInputIdx >= fAnchorLimit) {
2964                // Off the end of input.  Success.
2965                fHitEnd = TRUE;
2966                fRequireEnd = TRUE;
2967                break;
2968            } else {
2969                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2970                UChar32 c = UTEXT_NEXT32(fInputText);
2971                // Either at the last character of input, or off the end.
2972                if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2973                    fHitEnd = TRUE;
2974                    fRequireEnd = TRUE;
2975                    break;
2976                }
2977            }
2978
2979            // Not at end of input.  Back-track out.
2980            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2981            break;
2982
2983
2984         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
2985             {
2986                 if (fp->fInputIdx >= fAnchorLimit) {
2987                     // We really are at the end of input.  Success.
2988                     fHitEnd = TRUE;
2989                     fRequireEnd = TRUE;
2990                     break;
2991                 }
2992                 // If we are positioned just before a new-line, succeed.
2993                 // It makes no difference where the new-line is within the input.
2994                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2995                 UChar32 c = UTEXT_CURRENT32(fInputText);
2996                 if (isLineTerminator(c)) {
2997                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
2998                     //  In multi-line mode, hitting a new-line just before the end of input does not
2999                     //   set the hitEnd or requireEnd flags
3000                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3001                        break;
3002                     }
3003                 }
3004                 // not at a new line.  Fail.
3005                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3006             }
3007             break;
3008
3009
3010         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3011             {
3012                 if (fp->fInputIdx >= fAnchorLimit) {
3013                     // We really are at the end of input.  Success.
3014                     fHitEnd = TRUE;
3015                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3016                     break;               //   adding a new-line would not lose the match.
3017                 }
3018                 // If we are not positioned just before a new-line, the test fails; backtrack out.
3019                 // It makes no difference where the new-line is within the input.
3020                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3021                 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3022                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3023                 }
3024             }
3025             break;
3026
3027
3028       case URX_CARET:                    //  ^, test for start of line
3029            if (fp->fInputIdx != fAnchorStart) {
3030                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3031            }
3032            break;
3033
3034
3035       case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3036           {
3037               if (fp->fInputIdx == fAnchorStart) {
3038                   // We are at the start input.  Success.
3039                   break;
3040               }
3041               // Check whether character just before the current pos is a new-line
3042               //   unless we are at the end of input
3043               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3044               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3045               if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3046                   //  It's a new-line.  ^ is true.  Success.
3047                   //  TODO:  what should be done with positions between a CR and LF?
3048                   break;
3049               }
3050               // Not at the start of a line.  Fail.
3051               fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3052           }
3053           break;
3054
3055
3056       case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3057           {
3058               U_ASSERT(fp->fInputIdx >= fAnchorStart);
3059               if (fp->fInputIdx <= fAnchorStart) {
3060                   // We are at the start input.  Success.
3061                   break;
3062               }
3063               // Check whether character just before the current pos is a new-line
3064               U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3065               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3066               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3067               if (c != 0x0a) {
3068                   // Not at the start of a line.  Back-track out.
3069                   fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3070               }
3071           }
3072           break;
3073
3074        case URX_BACKSLASH_B:          // Test for word boundaries
3075            {
3076                UBool success = isWordBoundary(fp->fInputIdx);
3077                success ^= (UBool)(opValue != 0);     // flip sense for \B
3078                if (!success) {
3079                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3080                }
3081            }
3082            break;
3083
3084
3085        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3086            {
3087                UBool success = isUWordBoundary(fp->fInputIdx);
3088                success ^= (UBool)(opValue != 0);     // flip sense for \B
3089                if (!success) {
3090                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3091                }
3092            }
3093            break;
3094
3095
3096        case URX_BACKSLASH_D:            // Test for decimal digit
3097            {
3098                if (fp->fInputIdx >= fActiveLimit) {
3099                    fHitEnd = TRUE;
3100                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3101                    break;
3102                }
3103
3104                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3105
3106                UChar32 c = UTEXT_NEXT32(fInputText);
3107                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3108                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3109                success ^= (UBool)(opValue != 0);        // flip sense for \D
3110                if (success) {
3111                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3112                } else {
3113                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3114                }
3115            }
3116            break;
3117
3118
3119        case URX_BACKSLASH_G:          // Test for position at end of previous match
3120            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3121                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3122            }
3123            break;
3124
3125
3126        case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
3127            {
3128                if (fp->fInputIdx >= fActiveLimit) {
3129                    fHitEnd = TRUE;
3130                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3131                    break;
3132                }
3133                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3134                UChar32 c = UTEXT_NEXT32(fInputText);
3135                int8_t ctype = u_charType(c);
3136                UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
3137                success ^= (UBool)(opValue != 0);        // flip sense for \H
3138                if (success) {
3139                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3140                } else {
3141                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3142                }
3143            }
3144            break;
3145
3146
3147        case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
3148            {
3149                if (fp->fInputIdx >= fActiveLimit) {
3150                    fHitEnd = TRUE;
3151                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3152                    break;
3153                }
3154                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3155                UChar32 c = UTEXT_NEXT32(fInputText);
3156                if (isLineTerminator(c)) {
3157                    if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3158                        utext_next32(fInputText);
3159                    }
3160                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3161                } else {
3162                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3163                }
3164            }
3165            break;
3166
3167
3168        case URX_BACKSLASH_V:            // \v, any single line ending character.
3169            {
3170                if (fp->fInputIdx >= fActiveLimit) {
3171                    fHitEnd = TRUE;
3172                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3173                    break;
3174                }
3175                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3176                UChar32 c = UTEXT_NEXT32(fInputText);
3177                UBool success = isLineTerminator(c);
3178                success ^= (UBool)(opValue != 0);        // flip sense for \V
3179                if (success) {
3180                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3181                } else {
3182                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3183                }
3184            }
3185            break;
3186
3187
3188        case URX_BACKSLASH_X:
3189            //  Match a Grapheme, as defined by Unicode TR 29.
3190            //  Differs slightly from Perl, which consumes combining marks independently
3191            //    of context.
3192            {
3193
3194                // Fail if at end of input
3195                if (fp->fInputIdx >= fActiveLimit) {
3196                    fHitEnd = TRUE;
3197                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3198                    break;
3199                }
3200
3201                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3202
3203                // Examine (and consume) the current char.
3204                //   Dispatch into a little state machine, based on the char.
3205                UChar32  c;
3206                c = UTEXT_NEXT32(fInputText);
3207                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3208                UnicodeSet **sets = fPattern->fStaticSets;
3209                if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3210                if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3211                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3212                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3213                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3214                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3215                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3216                goto GC_Extend;
3217
3218
3219
3220GC_L:
3221                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3222                c = UTEXT_NEXT32(fInputText);
3223                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3224                if (sets[URX_GC_L]->contains(c))       goto GC_L;
3225                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3226                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3227                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3228                (void)UTEXT_PREVIOUS32(fInputText);
3229                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3230                goto GC_Extend;
3231
3232GC_V:
3233                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3234                c = UTEXT_NEXT32(fInputText);
3235                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3236                if (sets[URX_GC_V]->contains(c))       goto GC_V;
3237                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3238                (void)UTEXT_PREVIOUS32(fInputText);
3239                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3240                goto GC_Extend;
3241
3242GC_T:
3243                if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3244                c = UTEXT_NEXT32(fInputText);
3245                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3246                if (sets[URX_GC_T]->contains(c))       goto GC_T;
3247                (void)UTEXT_PREVIOUS32(fInputText);
3248                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3249                goto GC_Extend;
3250
3251GC_Extend:
3252                // Combining characters are consumed here
3253                for (;;) {
3254                    if (fp->fInputIdx >= fActiveLimit) {
3255                        break;
3256                    }
3257                    c = UTEXT_CURRENT32(fInputText);
3258                    if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3259                        break;
3260                    }
3261                    (void)UTEXT_NEXT32(fInputText);
3262                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3263                }
3264                goto GC_Done;
3265
3266GC_Control:
3267                // Most control chars stand alone (don't combine with combining chars),
3268                //   except for that CR/LF sequence is a single grapheme cluster.
3269                if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3270                    c = UTEXT_NEXT32(fInputText);
3271                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3272                }
3273
3274GC_Done:
3275                if (fp->fInputIdx >= fActiveLimit) {
3276                    fHitEnd = TRUE;
3277                }
3278                break;
3279            }
3280
3281
3282
3283
3284        case URX_BACKSLASH_Z:          // Test for end of Input
3285            if (fp->fInputIdx < fAnchorLimit) {
3286                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3287            } else {
3288                fHitEnd = TRUE;
3289                fRequireEnd = TRUE;
3290            }
3291            break;
3292
3293
3294
3295        case URX_STATIC_SETREF:
3296            {
3297                // Test input character against one of the predefined sets
3298                //    (Word Characters, for example)
3299                // The high bit of the op value is a flag for the match polarity.
3300                //    0:   success if input char is in set.
3301                //    1:   success if input char is not in set.
3302                if (fp->fInputIdx >= fActiveLimit) {
3303                    fHitEnd = TRUE;
3304                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3305                    break;
3306                }
3307
3308                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3309                opValue &= ~URX_NEG_SET;
3310                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3311
3312                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3313                UChar32 c = UTEXT_NEXT32(fInputText);
3314                if (c < 256) {
3315                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3316                    if (s8->contains(c)) {
3317                        success = !success;
3318                    }
3319                } else {
3320                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3321                    if (s->contains(c)) {
3322                        success = !success;
3323                    }
3324                }
3325                if (success) {
3326                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3327                } else {
3328                    // the character wasn't in the set.
3329                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3330                }
3331            }
3332            break;
3333
3334
3335        case URX_STAT_SETREF_N:
3336            {
3337                // Test input character for NOT being a member of  one of
3338                //    the predefined sets (Word Characters, for example)
3339                if (fp->fInputIdx >= fActiveLimit) {
3340                    fHitEnd = TRUE;
3341                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3342                    break;
3343                }
3344
3345                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3346
3347                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3348
3349                UChar32 c = UTEXT_NEXT32(fInputText);
3350                if (c < 256) {
3351                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3352                    if (s8->contains(c) == FALSE) {
3353                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3354                        break;
3355                    }
3356                } else {
3357                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
3358                    if (s->contains(c) == FALSE) {
3359                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3360                        break;
3361                    }
3362                }
3363                // the character wasn't in the set.
3364                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3365            }
3366            break;
3367
3368
3369        case URX_SETREF:
3370            if (fp->fInputIdx >= fActiveLimit) {
3371                fHitEnd = TRUE;
3372                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3373                break;
3374            } else {
3375                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3376
3377                // There is input left.  Pick up one char and test it for set membership.
3378                UChar32 c = UTEXT_NEXT32(fInputText);
3379                U_ASSERT(opValue > 0 && opValue < sets->size());
3380                if (c<256) {
3381                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3382                    if (s8->contains(c)) {
3383                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3384                        break;
3385                    }
3386                } else {
3387                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3388                    if (s->contains(c)) {
3389                        // The character is in the set.  A Match.
3390                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3391                        break;
3392                    }
3393                }
3394
3395                // the character wasn't in the set.
3396                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3397            }
3398            break;
3399
3400
3401        case URX_DOTANY:
3402            {
3403                // . matches anything, but stops at end-of-line.
3404                if (fp->fInputIdx >= fActiveLimit) {
3405                    // At end of input.  Match failed.  Backtrack out.
3406                    fHitEnd = TRUE;
3407                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3408                    break;
3409                }
3410
3411                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3412
3413                // There is input left.  Advance over one char, unless we've hit end-of-line
3414                UChar32 c = UTEXT_NEXT32(fInputText);
3415                if (isLineTerminator(c)) {
3416                    // End of line in normal mode.   . does not match.
3417                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3418                    break;
3419                }
3420                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3421            }
3422            break;
3423
3424
3425        case URX_DOTANY_ALL:
3426            {
3427                // ., in dot-matches-all (including new lines) mode
3428                if (fp->fInputIdx >= fActiveLimit) {
3429                    // At end of input.  Match failed.  Backtrack out.
3430                    fHitEnd = TRUE;
3431                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3432                    break;
3433                }
3434
3435                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3436
3437                // There is input left.  Advance over one char, except if we are
3438                //   at a cr/lf, advance over both of them.
3439                UChar32 c;
3440                c = UTEXT_NEXT32(fInputText);
3441                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3442                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3443                    // In the case of a CR/LF, we need to advance over both.
3444                    UChar32 nextc = UTEXT_CURRENT32(fInputText);
3445                    if (nextc == 0x0a) {
3446                        (void)UTEXT_NEXT32(fInputText);
3447                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3448                    }
3449                }
3450            }
3451            break;
3452
3453
3454        case URX_DOTANY_UNIX:
3455            {
3456                // '.' operator, matches all, but stops at end-of-line.
3457                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3458                if (fp->fInputIdx >= fActiveLimit) {
3459                    // At end of input.  Match failed.  Backtrack out.
3460                    fHitEnd = TRUE;
3461                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3462                    break;
3463                }
3464
3465                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3466
3467                // There is input left.  Advance over one char, unless we've hit end-of-line
3468                UChar32 c = UTEXT_NEXT32(fInputText);
3469                if (c == 0x0a) {
3470                    // End of line in normal mode.   '.' does not match the \n
3471                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3472                } else {
3473                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3474                }
3475            }
3476            break;
3477
3478
3479        case URX_JMP:
3480            fp->fPatIdx = opValue;
3481            break;
3482
3483        case URX_FAIL:
3484            isMatch = FALSE;
3485            goto breakFromLoop;
3486
3487        case URX_JMP_SAV:
3488            U_ASSERT(opValue < fPattern->fCompiledPat->size());
3489            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3490            fp->fPatIdx = opValue;                         // Then JMP.
3491            break;
3492
3493        case URX_JMP_SAV_X:
3494            // This opcode is used with (x)+, when x can match a zero length string.
3495            // Same as JMP_SAV, except conditional on the match having made forward progress.
3496            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3497            //   data address of the input position at the start of the loop.
3498            {
3499                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3500                int32_t  stoOp = (int32_t)pat[opValue-1];
3501                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3502                int32_t  frameLoc = URX_VAL(stoOp);
3503                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3504                int64_t prevInputIdx = fp->fExtra[frameLoc];
3505                U_ASSERT(prevInputIdx <= fp->fInputIdx);
3506                if (prevInputIdx < fp->fInputIdx) {
3507                    // The match did make progress.  Repeat the loop.
3508                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3509                    fp->fPatIdx = opValue;
3510                    fp->fExtra[frameLoc] = fp->fInputIdx;
3511                }
3512                // If the input position did not advance, we do nothing here,
3513                //   execution will fall out of the loop.
3514            }
3515            break;
3516
3517        case URX_CTR_INIT:
3518            {
3519                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3520                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3521
3522                // Pick up the three extra operands that CTR_INIT has, and
3523                //    skip the pattern location counter past
3524                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3525                fp->fPatIdx += 3;
3526                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3527                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3528                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3529                U_ASSERT(minCount>=0);
3530                U_ASSERT(maxCount>=minCount || maxCount==-1);
3531                U_ASSERT(loopLoc>=fp->fPatIdx);
3532
3533                if (minCount == 0) {
3534                    fp = StateSave(fp, loopLoc+1, status);
3535                }
3536                if (maxCount == -1) {
3537                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
3538                } else if (maxCount == 0) {
3539                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3540                }
3541            }
3542            break;
3543
3544        case URX_CTR_LOOP:
3545            {
3546                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3547                int32_t initOp = (int32_t)pat[opValue];
3548                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3549                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3550                int32_t minCount  = (int32_t)pat[opValue+2];
3551                int32_t maxCount  = (int32_t)pat[opValue+3];
3552                (*pCounter)++;
3553                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3554                    U_ASSERT(*pCounter == maxCount);
3555                    break;
3556                }
3557                if (*pCounter >= minCount) {
3558                    if (maxCount == -1) {
3559                        // Loop has no hard upper bound.
3560                        // Check that it is progressing through the input, break if it is not.
3561                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3562                        if (fp->fInputIdx == *pLastInputIdx) {
3563                            break;
3564                        } else {
3565                            *pLastInputIdx = fp->fInputIdx;
3566                        }
3567                    }
3568                    fp = StateSave(fp, fp->fPatIdx, status);
3569                }
3570                fp->fPatIdx = opValue + 4;    // Loop back.
3571            }
3572            break;
3573
3574        case URX_CTR_INIT_NG:
3575            {
3576                // Initialize a non-greedy loop
3577                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3578                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3579
3580                // Pick up the three extra operands that CTR_INIT_NG has, and
3581                //    skip the pattern location counter past
3582                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3583                fp->fPatIdx += 3;
3584                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3585                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3586                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3587                U_ASSERT(minCount>=0);
3588                U_ASSERT(maxCount>=minCount || maxCount==-1);
3589                U_ASSERT(loopLoc>fp->fPatIdx);
3590                if (maxCount == -1) {
3591                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
3592                }
3593
3594                if (minCount == 0) {
3595                    if (maxCount != 0) {
3596                        fp = StateSave(fp, fp->fPatIdx, status);
3597                    }
3598                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3599                }
3600            }
3601            break;
3602
3603        case URX_CTR_LOOP_NG:
3604            {
3605                // Non-greedy {min, max} loops
3606                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3607                int32_t initOp = (int32_t)pat[opValue];
3608                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3609                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3610                int32_t minCount  = (int32_t)pat[opValue+2];
3611                int32_t maxCount  = (int32_t)pat[opValue+3];
3612
3613                (*pCounter)++;
3614                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3615                    // The loop has matched the maximum permitted number of times.
3616                    //   Break out of here with no action.  Matching will
3617                    //   continue with the following pattern.
3618                    U_ASSERT(*pCounter == maxCount);
3619                    break;
3620                }
3621
3622                if (*pCounter < minCount) {
3623                    // We haven't met the minimum number of matches yet.
3624                    //   Loop back for another one.
3625                    fp->fPatIdx = opValue + 4;    // Loop back.
3626                } else {
3627                    // We do have the minimum number of matches.
3628
3629                    // If there is no upper bound on the loop iterations, check that the input index
3630                    // is progressing, and stop the loop if it is not.
3631                    if (maxCount == -1) {
3632                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3633                        if (fp->fInputIdx == *pLastInputIdx) {
3634                            break;
3635                        }
3636                        *pLastInputIdx = fp->fInputIdx;
3637                    }
3638
3639                    // Loop Continuation: we will fall into the pattern following the loop
3640                    //   (non-greedy, don't execute loop body first), but first do
3641                    //   a state save to the top of the loop, so that a match failure
3642                    //   in the following pattern will try another iteration of the loop.
3643                    fp = StateSave(fp, opValue + 4, status);
3644                }
3645            }
3646            break;
3647
3648        case URX_STO_SP:
3649            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3650            fData[opValue] = fStack->size();
3651            break;
3652
3653        case URX_LD_SP:
3654            {
3655                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3656                int32_t newStackSize = (int32_t)fData[opValue];
3657                U_ASSERT(newStackSize <= fStack->size());
3658                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3659                if (newFP == (int64_t *)fp) {
3660                    break;
3661                }
3662                int32_t i;
3663                for (i=0; i<fFrameSize; i++) {
3664                    newFP[i] = ((int64_t *)fp)[i];
3665                }
3666                fp = (REStackFrame *)newFP;
3667                fStack->setSize(newStackSize);
3668            }
3669            break;
3670
3671        case URX_BACKREF:
3672            {
3673                U_ASSERT(opValue < fFrameSize);
3674                int64_t groupStartIdx = fp->fExtra[opValue];
3675                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3676                U_ASSERT(groupStartIdx <= groupEndIdx);
3677                if (groupStartIdx < 0) {
3678                    // This capture group has not participated in the match thus far,
3679                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3680                    break;
3681                }
3682                UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3683                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3684
3685                //   Note: if the capture group match was of an empty string the backref
3686                //         match succeeds.  Verified by testing:  Perl matches succeed
3687                //         in this case, so we do too.
3688
3689                UBool success = TRUE;
3690                for (;;) {
3691                    if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3692                        success = TRUE;
3693                        break;
3694                    }
3695                    if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3696                        success = FALSE;
3697                        fHitEnd = TRUE;
3698                        break;
3699                    }
3700                    UChar32 captureGroupChar = utext_next32(fAltInputText);
3701                    UChar32 inputChar = utext_next32(fInputText);
3702                    if (inputChar != captureGroupChar) {
3703                        success = FALSE;
3704                        break;
3705                    }
3706                }
3707
3708                if (success) {
3709                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3710                } else {
3711                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3712                }
3713            }
3714            break;
3715
3716
3717
3718        case URX_BACKREF_I:
3719            {
3720                U_ASSERT(opValue < fFrameSize);
3721                int64_t groupStartIdx = fp->fExtra[opValue];
3722                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3723                U_ASSERT(groupStartIdx <= groupEndIdx);
3724                if (groupStartIdx < 0) {
3725                    // This capture group has not participated in the match thus far,
3726                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3727                    break;
3728                }
3729                utext_setNativeIndex(fAltInputText, groupStartIdx);
3730                utext_setNativeIndex(fInputText, fp->fInputIdx);
3731                CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3732                CaseFoldingUTextIterator inputItr(*fInputText);
3733
3734                //   Note: if the capture group match was of an empty string the backref
3735                //         match succeeds.  Verified by testing:  Perl matches succeed
3736                //         in this case, so we do too.
3737
3738                UBool success = TRUE;
3739                for (;;) {
3740                    if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3741                        success = TRUE;
3742                        break;
3743                    }
3744                    if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3745                        success = FALSE;
3746                        fHitEnd = TRUE;
3747                        break;
3748                    }
3749                    UChar32 captureGroupChar = captureGroupItr.next();
3750                    UChar32 inputChar = inputItr.next();
3751                    if (inputChar != captureGroupChar) {
3752                        success = FALSE;
3753                        break;
3754                    }
3755                }
3756
3757                if (success && inputItr.inExpansion()) {
3758                    // We otained a match by consuming part of a string obtained from
3759                    // case-folding a single code point of the input text.
3760                    // This does not count as an overall match.
3761                    success = FALSE;
3762                }
3763
3764                if (success) {
3765                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3766                } else {
3767                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3768                }
3769
3770            }
3771            break;
3772
3773        case URX_STO_INP_LOC:
3774            {
3775                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3776                fp->fExtra[opValue] = fp->fInputIdx;
3777            }
3778            break;
3779
3780        case URX_JMPX:
3781            {
3782                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3783                fp->fPatIdx += 1;
3784                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3785                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3786                int64_t savedInputIdx = fp->fExtra[dataLoc];
3787                U_ASSERT(savedInputIdx <= fp->fInputIdx);
3788                if (savedInputIdx < fp->fInputIdx) {
3789                    fp->fPatIdx = opValue;                               // JMP
3790                } else {
3791                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3792                }
3793            }
3794            break;
3795
3796        case URX_LA_START:
3797            {
3798                // Entering a lookahead block.
3799                // Save Stack Ptr, Input Pos.
3800                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3801                fData[opValue]   = fStack->size();
3802                fData[opValue+1] = fp->fInputIdx;
3803                fActiveStart     = fLookStart;          // Set the match region change for
3804                fActiveLimit     = fLookLimit;          //   transparent bounds.
3805            }
3806            break;
3807
3808        case URX_LA_END:
3809            {
3810                // Leaving a look-ahead block.
3811                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3812                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3813                int32_t stackSize = fStack->size();
3814                int32_t newStackSize =(int32_t)fData[opValue];
3815                U_ASSERT(stackSize >= newStackSize);
3816                if (stackSize > newStackSize) {
3817                    // Copy the current top frame back to the new (cut back) top frame.
3818                    //   This makes the capture groups from within the look-ahead
3819                    //   expression available.
3820                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3821                    int32_t i;
3822                    for (i=0; i<fFrameSize; i++) {
3823                        newFP[i] = ((int64_t *)fp)[i];
3824                    }
3825                    fp = (REStackFrame *)newFP;
3826                    fStack->setSize(newStackSize);
3827                }
3828                fp->fInputIdx = fData[opValue+1];
3829
3830                // Restore the active region bounds in the input string; they may have
3831                //    been changed because of transparent bounds on a Region.
3832                fActiveStart = fRegionStart;
3833                fActiveLimit = fRegionLimit;
3834            }
3835            break;
3836
3837        case URX_ONECHAR_I:
3838            // Case insensitive one char.  The char from the pattern is already case folded.
3839            // Input text is not, but case folding the input can not reduce two or more code
3840            // points to one.
3841            if (fp->fInputIdx < fActiveLimit) {
3842                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3843
3844                UChar32 c = UTEXT_NEXT32(fInputText);
3845                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3846                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3847                    break;
3848                }
3849            } else {
3850                fHitEnd = TRUE;
3851            }
3852
3853            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3854            break;
3855
3856        case URX_STRING_I:
3857            {
3858                // Case-insensitive test input against a literal string.
3859                // Strings require two slots in the compiled pattern, one for the
3860                //   offset to the string text, and one for the length.
3861                //   The compiled string has already been case folded.
3862                {
3863                    const UChar *patternString = litText + opValue;
3864                    int32_t      patternStringIdx  = 0;
3865
3866                    op      = (int32_t)pat[fp->fPatIdx];
3867                    fp->fPatIdx++;
3868                    opType  = URX_TYPE(op);
3869                    opValue = URX_VAL(op);
3870                    U_ASSERT(opType == URX_STRING_LEN);
3871                    int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3872
3873
3874                    UChar32   cPattern;
3875                    UChar32   cText;
3876                    UBool     success = TRUE;
3877
3878                    UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3879                    CaseFoldingUTextIterator inputIterator(*fInputText);
3880                    while (patternStringIdx < patternStringLen) {
3881                        if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3882                            success = FALSE;
3883                            fHitEnd = TRUE;
3884                            break;
3885                        }
3886                        U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3887                        cText = inputIterator.next();
3888                        if (cText != cPattern) {
3889                            success = FALSE;
3890                            break;
3891                        }
3892                    }
3893                    if (inputIterator.inExpansion()) {
3894                        success = FALSE;
3895                    }
3896
3897                    if (success) {
3898                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3899                    } else {
3900                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3901                    }
3902                }
3903            }
3904            break;
3905
3906        case URX_LB_START:
3907            {
3908                // Entering a look-behind block.
3909                // Save Stack Ptr, Input Pos.
3910                //   TODO:  implement transparent bounds.  Ticket #6067
3911                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3912                fData[opValue]   = fStack->size();
3913                fData[opValue+1] = fp->fInputIdx;
3914                // Init the variable containing the start index for attempted matches.
3915                fData[opValue+2] = -1;
3916                // Save input string length, then reset to pin any matches to end at
3917                //   the current position.
3918                fData[opValue+3] = fActiveLimit;
3919                fActiveLimit     = fp->fInputIdx;
3920            }
3921            break;
3922
3923
3924        case URX_LB_CONT:
3925            {
3926                // Positive Look-Behind, at top of loop checking for matches of LB expression
3927                //    at all possible input starting positions.
3928
3929                // Fetch the min and max possible match lengths.  They are the operands
3930                //   of this op in the pattern.
3931                int32_t minML = (int32_t)pat[fp->fPatIdx++];
3932                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3933                if (!UTEXT_USES_U16(fInputText)) {
3934                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3935                    // The max length need not be exact; it just needs to be >= actual maximum.
3936                    maxML *= 3;
3937                }
3938                U_ASSERT(minML <= maxML);
3939                U_ASSERT(minML >= 0);
3940
3941                // Fetch (from data) the last input index where a match was attempted.
3942                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3943                int64_t  &lbStartIdx = fData[opValue+2];
3944                if (lbStartIdx < 0) {
3945                    // First time through loop.
3946                    lbStartIdx = fp->fInputIdx - minML;
3947                    if (lbStartIdx > 0) {
3948                        // move index to a code point boudary, if it's not on one already.
3949                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3950                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3951                    }
3952                } else {
3953                    // 2nd through nth time through the loop.
3954                    // Back up start position for match by one.
3955                    if (lbStartIdx == 0) {
3956                        (lbStartIdx)--;
3957                    } else {
3958                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3959                        (void)UTEXT_PREVIOUS32(fInputText);
3960                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3961                    }
3962                }
3963
3964                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3965                    // We have tried all potential match starting points without
3966                    //  getting a match.  Backtrack out, and out of the
3967                    //   Look Behind altogether.
3968                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3969                    int64_t restoreInputLen = fData[opValue+3];
3970                    U_ASSERT(restoreInputLen >= fActiveLimit);
3971                    U_ASSERT(restoreInputLen <= fInputLength);
3972                    fActiveLimit = restoreInputLen;
3973                    break;
3974                }
3975
3976                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3977                //      (successful match will fall off the end of the loop.)
3978                fp = StateSave(fp, fp->fPatIdx-3, status);
3979                fp->fInputIdx = lbStartIdx;
3980            }
3981            break;
3982
3983        case URX_LB_END:
3984            // End of a look-behind block, after a successful match.
3985            {
3986                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3987                if (fp->fInputIdx != fActiveLimit) {
3988                    //  The look-behind expression matched, but the match did not
3989                    //    extend all the way to the point that we are looking behind from.
3990                    //  FAIL out of here, which will take us back to the LB_CONT, which
3991                    //     will retry the match starting at another position or fail
3992                    //     the look-behind altogether, whichever is appropriate.
3993                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3994                    break;
3995                }
3996
3997                // Look-behind match is good.  Restore the orignal input string length,
3998                //   which had been truncated to pin the end of the lookbehind match to the
3999                //   position being looked-behind.
4000                int64_t originalInputLen = fData[opValue+3];
4001                U_ASSERT(originalInputLen >= fActiveLimit);
4002                U_ASSERT(originalInputLen <= fInputLength);
4003                fActiveLimit = originalInputLen;
4004            }
4005            break;
4006
4007
4008        case URX_LBN_CONT:
4009            {
4010                // Negative Look-Behind, at top of loop checking for matches of LB expression
4011                //    at all possible input starting positions.
4012
4013                // Fetch the extra parameters of this op.
4014                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
4015                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
4016                if (!UTEXT_USES_U16(fInputText)) {
4017                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
4018                    // The max length need not be exact; it just needs to be >= actual maximum.
4019                    maxML *= 3;
4020                }
4021                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4022                        continueLoc = URX_VAL(continueLoc);
4023                U_ASSERT(minML <= maxML);
4024                U_ASSERT(minML >= 0);
4025                U_ASSERT(continueLoc > fp->fPatIdx);
4026
4027                // Fetch (from data) the last input index where a match was attempted.
4028                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4029                int64_t  &lbStartIdx = fData[opValue+2];
4030                if (lbStartIdx < 0) {
4031                    // First time through loop.
4032                    lbStartIdx = fp->fInputIdx - minML;
4033                    if (lbStartIdx > 0) {
4034                        // move index to a code point boudary, if it's not on one already.
4035                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4036                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4037                    }
4038                } else {
4039                    // 2nd through nth time through the loop.
4040                    // Back up start position for match by one.
4041                    if (lbStartIdx == 0) {
4042                        (lbStartIdx)--;
4043                    } else {
4044                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4045                        (void)UTEXT_PREVIOUS32(fInputText);
4046                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4047                    }
4048                }
4049
4050                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4051                    // We have tried all potential match starting points without
4052                    //  getting a match, which means that the negative lookbehind as
4053                    //  a whole has succeeded.  Jump forward to the continue location
4054                    int64_t restoreInputLen = fData[opValue+3];
4055                    U_ASSERT(restoreInputLen >= fActiveLimit);
4056                    U_ASSERT(restoreInputLen <= fInputLength);
4057                    fActiveLimit = restoreInputLen;
4058                    fp->fPatIdx = continueLoc;
4059                    break;
4060                }
4061
4062                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4063                //      (successful match will cause a FAIL out of the loop altogether.)
4064                fp = StateSave(fp, fp->fPatIdx-4, status);
4065                fp->fInputIdx = lbStartIdx;
4066            }
4067            break;
4068
4069        case URX_LBN_END:
4070            // End of a negative look-behind block, after a successful match.
4071            {
4072                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4073                if (fp->fInputIdx != fActiveLimit) {
4074                    //  The look-behind expression matched, but the match did not
4075                    //    extend all the way to the point that we are looking behind from.
4076                    //  FAIL out of here, which will take us back to the LB_CONT, which
4077                    //     will retry the match starting at another position or succeed
4078                    //     the look-behind altogether, whichever is appropriate.
4079                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4080                    break;
4081                }
4082
4083                // Look-behind expression matched, which means look-behind test as
4084                //   a whole Fails
4085
4086                //   Restore the orignal input string length, which had been truncated
4087                //   inorder to pin the end of the lookbehind match
4088                //   to the position being looked-behind.
4089                int64_t originalInputLen = fData[opValue+3];
4090                U_ASSERT(originalInputLen >= fActiveLimit);
4091                U_ASSERT(originalInputLen <= fInputLength);
4092                fActiveLimit = originalInputLen;
4093
4094                // Restore original stack position, discarding any state saved
4095                //   by the successful pattern match.
4096                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4097                int32_t newStackSize = (int32_t)fData[opValue];
4098                U_ASSERT(fStack->size() > newStackSize);
4099                fStack->setSize(newStackSize);
4100
4101                //  FAIL, which will take control back to someplace
4102                //  prior to entering the look-behind test.
4103                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4104            }
4105            break;
4106
4107
4108        case URX_LOOP_SR_I:
4109            // Loop Initialization for the optimized implementation of
4110            //     [some character set]*
4111            //   This op scans through all matching input.
4112            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4113            {
4114                U_ASSERT(opValue > 0 && opValue < sets->size());
4115                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4116                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4117
4118                // Loop through input, until either the input is exhausted or
4119                //   we reach a character that is not a member of the set.
4120                int64_t ix = fp->fInputIdx;
4121                UTEXT_SETNATIVEINDEX(fInputText, ix);
4122                for (;;) {
4123                    if (ix >= fActiveLimit) {
4124                        fHitEnd = TRUE;
4125                        break;
4126                    }
4127                    UChar32 c = UTEXT_NEXT32(fInputText);
4128                    if (c<256) {
4129                        if (s8->contains(c) == FALSE) {
4130                            break;
4131                        }
4132                    } else {
4133                        if (s->contains(c) == FALSE) {
4134                            break;
4135                        }
4136                    }
4137                    ix = UTEXT_GETNATIVEINDEX(fInputText);
4138                }
4139
4140                // If there were no matching characters, skip over the loop altogether.
4141                //   The loop doesn't run at all, a * op always succeeds.
4142                if (ix == fp->fInputIdx) {
4143                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4144                    break;
4145                }
4146
4147                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4148                //   must follow.  It's operand is the stack location
4149                //   that holds the starting input index for the match of this [set]*
4150                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4151                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4152                int32_t stackLoc = URX_VAL(loopcOp);
4153                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4154                fp->fExtra[stackLoc] = fp->fInputIdx;
4155                fp->fInputIdx = ix;
4156
4157                // Save State to the URX_LOOP_C op that follows this one,
4158                //   so that match failures in the following code will return to there.
4159                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4160                fp = StateSave(fp, fp->fPatIdx, status);
4161                fp->fPatIdx++;
4162            }
4163            break;
4164
4165
4166        case URX_LOOP_DOT_I:
4167            // Loop Initialization for the optimized implementation of .*
4168            //   This op scans through all remaining input.
4169            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4170            {
4171                // Loop through input until the input is exhausted (we reach an end-of-line)
4172                // In DOTALL mode, we can just go straight to the end of the input.
4173                int64_t ix;
4174                if ((opValue & 1) == 1) {
4175                    // Dot-matches-All mode.  Jump straight to the end of the string.
4176                    ix = fActiveLimit;
4177                    fHitEnd = TRUE;
4178                } else {
4179                    // NOT DOT ALL mode.  Line endings do not match '.'
4180                    // Scan forward until a line ending or end of input.
4181                    ix = fp->fInputIdx;
4182                    UTEXT_SETNATIVEINDEX(fInputText, ix);
4183                    for (;;) {
4184                        if (ix >= fActiveLimit) {
4185                            fHitEnd = TRUE;
4186                            break;
4187                        }
4188                        UChar32 c = UTEXT_NEXT32(fInputText);
4189                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4190                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4191                               (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4192                                    isLineTerminator(c))) {
4193                                //  char is a line ending.  Exit the scanning loop.
4194                                break;
4195                            }
4196                        }
4197                        ix = UTEXT_GETNATIVEINDEX(fInputText);
4198                    }
4199                }
4200
4201                // If there were no matching characters, skip over the loop altogether.
4202                //   The loop doesn't run at all, a * op always succeeds.
4203                if (ix == fp->fInputIdx) {
4204                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4205                    break;
4206                }
4207
4208                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4209                //   must follow.  It's operand is the stack location
4210                //   that holds the starting input index for the match of this .*
4211                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4212                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4213                int32_t stackLoc = URX_VAL(loopcOp);
4214                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4215                fp->fExtra[stackLoc] = fp->fInputIdx;
4216                fp->fInputIdx = ix;
4217
4218                // Save State to the URX_LOOP_C op that follows this one,
4219                //   so that match failures in the following code will return to there.
4220                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4221                fp = StateSave(fp, fp->fPatIdx, status);
4222                fp->fPatIdx++;
4223            }
4224            break;
4225
4226
4227        case URX_LOOP_C:
4228            {
4229                U_ASSERT(opValue>=0 && opValue<fFrameSize);
4230                backSearchIndex = fp->fExtra[opValue];
4231                U_ASSERT(backSearchIndex <= fp->fInputIdx);
4232                if (backSearchIndex == fp->fInputIdx) {
4233                    // We've backed up the input idx to the point that the loop started.
4234                    // The loop is done.  Leave here without saving state.
4235                    //  Subsequent failures won't come back here.
4236                    break;
4237                }
4238                // Set up for the next iteration of the loop, with input index
4239                //   backed up by one from the last time through,
4240                //   and a state save to this instruction in case the following code fails again.
4241                //   (We're going backwards because this loop emulates stack unwinding, not
4242                //    the initial scan forward.)
4243                U_ASSERT(fp->fInputIdx > 0);
4244                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4245                UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4246                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4247
4248                UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4249                if (prevC == 0x0a &&
4250                    fp->fInputIdx > backSearchIndex &&
4251                    twoPrevC == 0x0d) {
4252                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4253                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4254                        // .*, stepping back over CRLF pair.
4255                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4256                    }
4257                }
4258
4259
4260                fp = StateSave(fp, fp->fPatIdx-1, status);
4261            }
4262            break;
4263
4264
4265
4266        default:
4267            // Trouble.  The compiled pattern contains an entry with an
4268            //           unrecognized type tag.
4269            U_ASSERT(FALSE);
4270        }
4271
4272        if (U_FAILURE(status)) {
4273            isMatch = FALSE;
4274            break;
4275        }
4276    }
4277
4278breakFromLoop:
4279    fMatch = isMatch;
4280    if (isMatch) {
4281        fLastMatchEnd = fMatchEnd;
4282        fMatchStart   = startIdx;
4283        fMatchEnd     = fp->fInputIdx;
4284    }
4285
4286#ifdef REGEX_RUN_DEBUG
4287    if (fTraceDebug) {
4288        if (isMatch) {
4289            printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
4290        } else {
4291            printf("No match\n\n");
4292        }
4293    }
4294#endif
4295
4296    fFrame = fp;                // The active stack frame when the engine stopped.
4297                                //   Contains the capture group results that we need to
4298                                //    access later.
4299    return;
4300}
4301
4302
4303//--------------------------------------------------------------------------------
4304//
4305//   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4306//                  assumption that the entire string is available in the UText's
4307//                  chunk buffer. For now, that means we can use int32_t indexes,
4308//                  except for anything that needs to be saved (like group starts
4309//                  and ends).
4310//
4311//                  startIdx:    begin matching a this index.
4312//                  toEnd:       if true, match must extend to end of the input region
4313//
4314//--------------------------------------------------------------------------------
4315void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4316    UBool       isMatch  = FALSE;      // True if the we have a match.
4317
4318    int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4319
4320    int32_t     op;                    // Operation from the compiled pattern, split into
4321    int32_t     opType;                //    the opcode
4322    int32_t     opValue;               //    and the operand value.
4323
4324#ifdef REGEX_RUN_DEBUG
4325    if (fTraceDebug) {
4326        printf("MatchAt(startIdx=%d)\n", startIdx);
4327        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4328        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4329    }
4330#endif
4331
4332    if (U_FAILURE(status)) {
4333        return;
4334    }
4335
4336    //  Cache frequently referenced items from the compiled pattern
4337    //
4338    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4339
4340    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4341    UVector             *sets          = fPattern->fSets;
4342
4343    const UChar         *inputBuf      = fInputText->chunkContents;
4344
4345    fFrameSize = fPattern->fFrameSize;
4346    REStackFrame        *fp            = resetStack();
4347    if (U_FAILURE(fDeferredStatus)) {
4348        status = fDeferredStatus;
4349        return;
4350    }
4351
4352    fp->fPatIdx   = 0;
4353    fp->fInputIdx = startIdx;
4354
4355    // Zero out the pattern's static data
4356    int32_t i;
4357    for (i = 0; i<fPattern->fDataSize; i++) {
4358        fData[i] = 0;
4359    }
4360
4361    //
4362    //  Main loop for interpreting the compiled pattern.
4363    //  One iteration of the loop per pattern operation performed.
4364    //
4365    for (;;) {
4366        op      = (int32_t)pat[fp->fPatIdx];
4367        opType  = URX_TYPE(op);
4368        opValue = URX_VAL(op);
4369#ifdef REGEX_RUN_DEBUG
4370        if (fTraceDebug) {
4371            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4372            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
4373                   UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4374            fPattern->dumpOp(fp->fPatIdx);
4375        }
4376#endif
4377        fp->fPatIdx++;
4378
4379        switch (opType) {
4380
4381
4382        case URX_NOP:
4383            break;
4384
4385
4386        case URX_BACKTRACK:
4387            // Force a backtrack.  In some circumstances, the pattern compiler
4388            //   will notice that the pattern can't possibly match anything, and will
4389            //   emit one of these at that point.
4390            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4391            break;
4392
4393
4394        case URX_ONECHAR:
4395            if (fp->fInputIdx < fActiveLimit) {
4396                UChar32 c;
4397                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4398                if (c == opValue) {
4399                    break;
4400                }
4401            } else {
4402                fHitEnd = TRUE;
4403            }
4404            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4405            break;
4406
4407
4408        case URX_STRING:
4409            {
4410                // Test input against a literal string.
4411                // Strings require two slots in the compiled pattern, one for the
4412                //   offset to the string text, and one for the length.
4413                int32_t   stringStartIdx = opValue;
4414                int32_t   stringLen;
4415
4416                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4417                fp->fPatIdx++;
4418                opType    = URX_TYPE(op);
4419                stringLen = URX_VAL(op);
4420                U_ASSERT(opType == URX_STRING_LEN);
4421                U_ASSERT(stringLen >= 2);
4422
4423                const UChar * pInp = inputBuf + fp->fInputIdx;
4424                const UChar * pInpLimit = inputBuf + fActiveLimit;
4425                const UChar * pPat = litText+stringStartIdx;
4426                const UChar * pEnd = pInp + stringLen;
4427                UBool success = TRUE;
4428                while (pInp < pEnd) {
4429                    if (pInp >= pInpLimit) {
4430                        fHitEnd = TRUE;
4431                        success = FALSE;
4432                        break;
4433                    }
4434                    if (*pInp++ != *pPat++) {
4435                        success = FALSE;
4436                        break;
4437                    }
4438                }
4439
4440                if (success) {
4441                    fp->fInputIdx += stringLen;
4442                } else {
4443                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4444                }
4445            }
4446            break;
4447
4448
4449        case URX_STATE_SAVE:
4450            fp = StateSave(fp, opValue, status);
4451            break;
4452
4453
4454        case URX_END:
4455            // The match loop will exit via this path on a successful match,
4456            //   when we reach the end of the pattern.
4457            if (toEnd && fp->fInputIdx != fActiveLimit) {
4458                // The pattern matched, but not to the end of input.  Try some more.
4459                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4460                break;
4461            }
4462            isMatch = TRUE;
4463            goto  breakFromLoop;
4464
4465            // Start and End Capture stack frame variables are laid out out like this:
4466            //  fp->fExtra[opValue]  - The start of a completed capture group
4467            //             opValue+1 - The end   of a completed capture group
4468            //             opValue+2 - the start of a capture group whose end
4469            //                          has not yet been reached (and might not ever be).
4470        case URX_START_CAPTURE:
4471            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4472            fp->fExtra[opValue+2] = fp->fInputIdx;
4473            break;
4474
4475
4476        case URX_END_CAPTURE:
4477            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4478            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4479            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4480            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4481            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4482            break;
4483
4484
4485        case URX_DOLLAR:                   //  $, test for End of line
4486            //     or for position before new line at end of input
4487            if (fp->fInputIdx < fAnchorLimit-2) {
4488                // We are no where near the end of input.  Fail.
4489                //   This is the common case.  Keep it first.
4490                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4491                break;
4492            }
4493            if (fp->fInputIdx >= fAnchorLimit) {
4494                // We really are at the end of input.  Success.
4495                fHitEnd = TRUE;
4496                fRequireEnd = TRUE;
4497                break;
4498            }
4499
4500            // If we are positioned just before a new-line that is located at the
4501            //   end of input, succeed.
4502            if (fp->fInputIdx == fAnchorLimit-1) {
4503                UChar32 c;
4504                U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4505
4506                if (isLineTerminator(c)) {
4507                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4508                        // At new-line at end of input. Success
4509                        fHitEnd = TRUE;
4510                        fRequireEnd = TRUE;
4511                        break;
4512                    }
4513                }
4514            } else if (fp->fInputIdx == fAnchorLimit-2 &&
4515                inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4516                    fHitEnd = TRUE;
4517                    fRequireEnd = TRUE;
4518                    break;                         // At CR/LF at end of input.  Success
4519            }
4520
4521            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4522
4523            break;
4524
4525
4526        case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4527            if (fp->fInputIdx >= fAnchorLimit-1) {
4528                // Either at the last character of input, or off the end.
4529                if (fp->fInputIdx == fAnchorLimit-1) {
4530                    // At last char of input.  Success if it's a new line.
4531                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4532                        fHitEnd = TRUE;
4533                        fRequireEnd = TRUE;
4534                        break;
4535                    }
4536                } else {
4537                    // Off the end of input.  Success.
4538                    fHitEnd = TRUE;
4539                    fRequireEnd = TRUE;
4540                    break;
4541                }
4542            }
4543
4544            // Not at end of input.  Back-track out.
4545            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4546            break;
4547
4548
4549        case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4550            {
4551                if (fp->fInputIdx >= fAnchorLimit) {
4552                    // We really are at the end of input.  Success.
4553                    fHitEnd = TRUE;
4554                    fRequireEnd = TRUE;
4555                    break;
4556                }
4557                // If we are positioned just before a new-line, succeed.
4558                // It makes no difference where the new-line is within the input.
4559                UChar32 c = inputBuf[fp->fInputIdx];
4560                if (isLineTerminator(c)) {
4561                    // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4562                    //  In multi-line mode, hitting a new-line just before the end of input does not
4563                    //   set the hitEnd or requireEnd flags
4564                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4565                        break;
4566                    }
4567                }
4568                // not at a new line.  Fail.
4569                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4570            }
4571            break;
4572
4573
4574        case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4575            {
4576                if (fp->fInputIdx >= fAnchorLimit) {
4577                    // We really are at the end of input.  Success.
4578                    fHitEnd = TRUE;
4579                    fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4580                    break;               //   adding a new-line would not lose the match.
4581                }
4582                // If we are not positioned just before a new-line, the test fails; backtrack out.
4583                // It makes no difference where the new-line is within the input.
4584                if (inputBuf[fp->fInputIdx] != 0x0a) {
4585                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4586                }
4587            }
4588            break;
4589
4590
4591        case URX_CARET:                    //  ^, test for start of line
4592            if (fp->fInputIdx != fAnchorStart) {
4593                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4594            }
4595            break;
4596
4597
4598        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4599            {
4600                if (fp->fInputIdx == fAnchorStart) {
4601                    // We are at the start input.  Success.
4602                    break;
4603                }
4604                // Check whether character just before the current pos is a new-line
4605                //   unless we are at the end of input
4606                UChar  c = inputBuf[fp->fInputIdx - 1];
4607                if ((fp->fInputIdx < fAnchorLimit) &&
4608                    isLineTerminator(c)) {
4609                    //  It's a new-line.  ^ is true.  Success.
4610                    //  TODO:  what should be done with positions between a CR and LF?
4611                    break;
4612                }
4613                // Not at the start of a line.  Fail.
4614                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4615            }
4616            break;
4617
4618
4619        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4620            {
4621                U_ASSERT(fp->fInputIdx >= fAnchorStart);
4622                if (fp->fInputIdx <= fAnchorStart) {
4623                    // We are at the start input.  Success.
4624                    break;
4625                }
4626                // Check whether character just before the current pos is a new-line
4627                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4628                UChar  c = inputBuf[fp->fInputIdx - 1];
4629                if (c != 0x0a) {
4630                    // Not at the start of a line.  Back-track out.
4631                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4632                }
4633            }
4634            break;
4635
4636        case URX_BACKSLASH_B:          // Test for word boundaries
4637            {
4638                UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4639                success ^= (UBool)(opValue != 0);     // flip sense for \B
4640                if (!success) {
4641                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4642                }
4643            }
4644            break;
4645
4646
4647        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4648            {
4649                UBool success = isUWordBoundary(fp->fInputIdx);
4650                success ^= (UBool)(opValue != 0);     // flip sense for \B
4651                if (!success) {
4652                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4653                }
4654            }
4655            break;
4656
4657
4658        case URX_BACKSLASH_D:            // Test for decimal digit
4659            {
4660                if (fp->fInputIdx >= fActiveLimit) {
4661                    fHitEnd = TRUE;
4662                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4663                    break;
4664                }
4665
4666                UChar32 c;
4667                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4668                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4669                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4670                success ^= (UBool)(opValue != 0);        // flip sense for \D
4671                if (!success) {
4672                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4673                }
4674            }
4675            break;
4676
4677
4678        case URX_BACKSLASH_G:          // Test for position at end of previous match
4679            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4680                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4681            }
4682            break;
4683
4684
4685        case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
4686            {
4687                if (fp->fInputIdx >= fActiveLimit) {
4688                    fHitEnd = TRUE;
4689                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4690                    break;
4691                }
4692                UChar32 c;
4693                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4694                int8_t ctype = u_charType(c);
4695                UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
4696                success ^= (UBool)(opValue != 0);        // flip sense for \H
4697                if (!success) {
4698                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4699                }
4700            }
4701            break;
4702
4703
4704        case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
4705            {
4706                if (fp->fInputIdx >= fActiveLimit) {
4707                    fHitEnd = TRUE;
4708                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4709                    break;
4710                }
4711                UChar32 c;
4712                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4713                if (isLineTerminator(c)) {
4714                    if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4715                        // Check for CR/LF sequence. Consume both together when found.
4716                        UChar c2;
4717                        U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4718                        if (c2 != 0x0a) {
4719                            U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4720                        }
4721                    }
4722                } else {
4723                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4724                }
4725            }
4726            break;
4727
4728
4729        case URX_BACKSLASH_V:         // Any single code point line ending.
4730            {
4731                if (fp->fInputIdx >= fActiveLimit) {
4732                    fHitEnd = TRUE;
4733                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4734                    break;
4735                }
4736                UChar32 c;
4737                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4738                UBool success = isLineTerminator(c);
4739                success ^= (UBool)(opValue != 0);        // flip sense for \V
4740                if (!success) {
4741                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4742                }
4743            }
4744            break;
4745
4746
4747
4748        case URX_BACKSLASH_X:
4749        //  Match a Grapheme, as defined by Unicode TR 29.
4750        //  Differs slightly from Perl, which consumes combining marks independently
4751        //    of context.
4752        {
4753
4754            // Fail if at end of input
4755            if (fp->fInputIdx >= fActiveLimit) {
4756                fHitEnd = TRUE;
4757                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4758                break;
4759            }
4760
4761            // Examine (and consume) the current char.
4762            //   Dispatch into a little state machine, based on the char.
4763            UChar32  c;
4764            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4765            UnicodeSet **sets = fPattern->fStaticSets;
4766            if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4767            if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4768            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4769            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4770            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4771            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4772            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4773            goto GC_Extend;
4774
4775
4776
4777GC_L:
4778            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4779            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4780            if (sets[URX_GC_L]->contains(c))       goto GC_L;
4781            if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4782            if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4783            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4784            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4785            goto GC_Extend;
4786
4787GC_V:
4788            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4789            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4790            if (sets[URX_GC_V]->contains(c))       goto GC_V;
4791            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4792            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4793            goto GC_Extend;
4794
4795GC_T:
4796            if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4797            U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4798            if (sets[URX_GC_T]->contains(c))       goto GC_T;
4799            U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4800            goto GC_Extend;
4801
4802GC_Extend:
4803            // Combining characters are consumed here
4804            for (;;) {
4805                if (fp->fInputIdx >= fActiveLimit) {
4806                    break;
4807                }
4808                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4809                if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4810                    U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4811                    break;
4812                }
4813            }
4814            goto GC_Done;
4815
4816GC_Control:
4817            // Most control chars stand alone (don't combine with combining chars),
4818            //   except for that CR/LF sequence is a single grapheme cluster.
4819            if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4820                fp->fInputIdx++;
4821            }
4822
4823GC_Done:
4824            if (fp->fInputIdx >= fActiveLimit) {
4825                fHitEnd = TRUE;
4826            }
4827            break;
4828        }
4829
4830
4831
4832
4833        case URX_BACKSLASH_Z:          // Test for end of Input
4834            if (fp->fInputIdx < fAnchorLimit) {
4835                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4836            } else {
4837                fHitEnd = TRUE;
4838                fRequireEnd = TRUE;
4839            }
4840            break;
4841
4842
4843
4844        case URX_STATIC_SETREF:
4845            {
4846                // Test input character against one of the predefined sets
4847                //    (Word Characters, for example)
4848                // The high bit of the op value is a flag for the match polarity.
4849                //    0:   success if input char is in set.
4850                //    1:   success if input char is not in set.
4851                if (fp->fInputIdx >= fActiveLimit) {
4852                    fHitEnd = TRUE;
4853                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4854                    break;
4855                }
4856
4857                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4858                opValue &= ~URX_NEG_SET;
4859                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4860
4861                UChar32 c;
4862                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4863                if (c < 256) {
4864                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4865                    if (s8->contains(c)) {
4866                        success = !success;
4867                    }
4868                } else {
4869                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
4870                    if (s->contains(c)) {
4871                        success = !success;
4872                    }
4873                }
4874                if (!success) {
4875                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4876                }
4877            }
4878            break;
4879
4880
4881        case URX_STAT_SETREF_N:
4882            {
4883                // Test input character for NOT being a member of  one of
4884                //    the predefined sets (Word Characters, for example)
4885                if (fp->fInputIdx >= fActiveLimit) {
4886                    fHitEnd = TRUE;
4887                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4888                    break;
4889                }
4890
4891                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4892
4893                UChar32  c;
4894                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4895                if (c < 256) {
4896                    Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4897                    if (s8->contains(c) == FALSE) {
4898                        break;
4899                    }
4900                } else {
4901                    const UnicodeSet *s = fPattern->fStaticSets[opValue];
4902                    if (s->contains(c) == FALSE) {
4903                        break;
4904                    }
4905                }
4906                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4907            }
4908            break;
4909
4910
4911        case URX_SETREF:
4912            {
4913                if (fp->fInputIdx >= fActiveLimit) {
4914                    fHitEnd = TRUE;
4915                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4916                    break;
4917                }
4918
4919                U_ASSERT(opValue > 0 && opValue < sets->size());
4920
4921                // There is input left.  Pick up one char and test it for set membership.
4922                UChar32  c;
4923                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4924                if (c<256) {
4925                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4926                    if (s8->contains(c)) {
4927                        // The character is in the set.  A Match.
4928                        break;
4929                    }
4930                } else {
4931                    UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4932                    if (s->contains(c)) {
4933                        // The character is in the set.  A Match.
4934                        break;
4935                    }
4936                }
4937
4938                // the character wasn't in the set.
4939                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4940            }
4941            break;
4942
4943
4944        case URX_DOTANY:
4945            {
4946                // . matches anything, but stops at end-of-line.
4947                if (fp->fInputIdx >= fActiveLimit) {
4948                    // At end of input.  Match failed.  Backtrack out.
4949                    fHitEnd = TRUE;
4950                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4951                    break;
4952                }
4953
4954                // There is input left.  Advance over one char, unless we've hit end-of-line
4955                UChar32  c;
4956                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4957                if (isLineTerminator(c)) {
4958                    // End of line in normal mode.   . does not match.
4959                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4960                    break;
4961                }
4962            }
4963            break;
4964
4965
4966        case URX_DOTANY_ALL:
4967            {
4968                // . in dot-matches-all (including new lines) mode
4969                if (fp->fInputIdx >= fActiveLimit) {
4970                    // At end of input.  Match failed.  Backtrack out.
4971                    fHitEnd = TRUE;
4972                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4973                    break;
4974                }
4975
4976                // There is input left.  Advance over one char, except if we are
4977                //   at a cr/lf, advance over both of them.
4978                UChar32 c;
4979                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4980                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4981                    // In the case of a CR/LF, we need to advance over both.
4982                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4983                        U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4984                    }
4985                }
4986            }
4987            break;
4988
4989
4990        case URX_DOTANY_UNIX:
4991            {
4992                // '.' operator, matches all, but stops at end-of-line.
4993                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
4994                if (fp->fInputIdx >= fActiveLimit) {
4995                    // At end of input.  Match failed.  Backtrack out.
4996                    fHitEnd = TRUE;
4997                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4998                    break;
4999                }
5000
5001                // There is input left.  Advance over one char, unless we've hit end-of-line
5002                UChar32 c;
5003                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5004                if (c == 0x0a) {
5005                    // End of line in normal mode.   '.' does not match the \n
5006                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5007                }
5008            }
5009            break;
5010
5011
5012        case URX_JMP:
5013            fp->fPatIdx = opValue;
5014            break;
5015
5016        case URX_FAIL:
5017            isMatch = FALSE;
5018            goto breakFromLoop;
5019
5020        case URX_JMP_SAV:
5021            U_ASSERT(opValue < fPattern->fCompiledPat->size());
5022            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
5023            fp->fPatIdx = opValue;                         // Then JMP.
5024            break;
5025
5026        case URX_JMP_SAV_X:
5027            // This opcode is used with (x)+, when x can match a zero length string.
5028            // Same as JMP_SAV, except conditional on the match having made forward progress.
5029            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5030            //   data address of the input position at the start of the loop.
5031            {
5032                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5033                int32_t  stoOp = (int32_t)pat[opValue-1];
5034                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5035                int32_t  frameLoc = URX_VAL(stoOp);
5036                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5037                int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5038                U_ASSERT(prevInputIdx <= fp->fInputIdx);
5039                if (prevInputIdx < fp->fInputIdx) {
5040                    // The match did make progress.  Repeat the loop.
5041                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
5042                    fp->fPatIdx = opValue;
5043                    fp->fExtra[frameLoc] = fp->fInputIdx;
5044                }
5045                // If the input position did not advance, we do nothing here,
5046                //   execution will fall out of the loop.
5047            }
5048            break;
5049
5050        case URX_CTR_INIT:
5051            {
5052                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5053                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5054
5055                // Pick up the three extra operands that CTR_INIT has, and
5056                //    skip the pattern location counter past
5057                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5058                fp->fPatIdx += 3;
5059                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5060                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5061                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5062                U_ASSERT(minCount>=0);
5063                U_ASSERT(maxCount>=minCount || maxCount==-1);
5064                U_ASSERT(loopLoc>=fp->fPatIdx);
5065
5066                if (minCount == 0) {
5067                    fp = StateSave(fp, loopLoc+1, status);
5068                }
5069                if (maxCount == -1) {
5070                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
5071                } else if (maxCount == 0) {
5072                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5073                }
5074            }
5075            break;
5076
5077        case URX_CTR_LOOP:
5078            {
5079                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5080                int32_t initOp = (int32_t)pat[opValue];
5081                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5082                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5083                int32_t minCount  = (int32_t)pat[opValue+2];
5084                int32_t maxCount  = (int32_t)pat[opValue+3];
5085                (*pCounter)++;
5086                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5087                    U_ASSERT(*pCounter == maxCount);
5088                    break;
5089                }
5090                if (*pCounter >= minCount) {
5091                    if (maxCount == -1) {
5092                        // Loop has no hard upper bound.
5093                        // Check that it is progressing through the input, break if it is not.
5094                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5095                        if (fp->fInputIdx == *pLastInputIdx) {
5096                            break;
5097                        } else {
5098                            *pLastInputIdx = fp->fInputIdx;
5099                        }
5100                    }
5101                    fp = StateSave(fp, fp->fPatIdx, status);
5102                }
5103                fp->fPatIdx = opValue + 4;    // Loop back.
5104            }
5105            break;
5106
5107        case URX_CTR_INIT_NG:
5108            {
5109                // Initialize a non-greedy loop
5110                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5111                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5112
5113                // Pick up the three extra operands that CTR_INIT_NG has, and
5114                //    skip the pattern location counter past
5115                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5116                fp->fPatIdx += 3;
5117                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5118                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5119                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5120                U_ASSERT(minCount>=0);
5121                U_ASSERT(maxCount>=minCount || maxCount==-1);
5122                U_ASSERT(loopLoc>fp->fPatIdx);
5123                if (maxCount == -1) {
5124                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
5125                }
5126
5127                if (minCount == 0) {
5128                    if (maxCount != 0) {
5129                        fp = StateSave(fp, fp->fPatIdx, status);
5130                    }
5131                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5132                }
5133            }
5134            break;
5135
5136        case URX_CTR_LOOP_NG:
5137            {
5138                // Non-greedy {min, max} loops
5139                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5140                int32_t initOp = (int32_t)pat[opValue];
5141                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5142                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5143                int32_t minCount  = (int32_t)pat[opValue+2];
5144                int32_t maxCount  = (int32_t)pat[opValue+3];
5145
5146                (*pCounter)++;
5147                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5148                    // The loop has matched the maximum permitted number of times.
5149                    //   Break out of here with no action.  Matching will
5150                    //   continue with the following pattern.
5151                    U_ASSERT(*pCounter == maxCount);
5152                    break;
5153                }
5154
5155                if (*pCounter < minCount) {
5156                    // We haven't met the minimum number of matches yet.
5157                    //   Loop back for another one.
5158                    fp->fPatIdx = opValue + 4;    // Loop back.
5159                } else {
5160                    // We do have the minimum number of matches.
5161
5162                    // If there is no upper bound on the loop iterations, check that the input index
5163                    // is progressing, and stop the loop if it is not.
5164                    if (maxCount == -1) {
5165                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5166                        if (fp->fInputIdx == *pLastInputIdx) {
5167                            break;
5168                        }
5169                        *pLastInputIdx = fp->fInputIdx;
5170                    }
5171
5172                    // Loop Continuation: we will fall into the pattern following the loop
5173                    //   (non-greedy, don't execute loop body first), but first do
5174                    //   a state save to the top of the loop, so that a match failure
5175                    //   in the following pattern will try another iteration of the loop.
5176                    fp = StateSave(fp, opValue + 4, status);
5177                }
5178            }
5179            break;
5180
5181        case URX_STO_SP:
5182            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5183            fData[opValue] = fStack->size();
5184            break;
5185
5186        case URX_LD_SP:
5187            {
5188                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5189                int32_t newStackSize = (int32_t)fData[opValue];
5190                U_ASSERT(newStackSize <= fStack->size());
5191                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5192                if (newFP == (int64_t *)fp) {
5193                    break;
5194                }
5195                int32_t i;
5196                for (i=0; i<fFrameSize; i++) {
5197                    newFP[i] = ((int64_t *)fp)[i];
5198                }
5199                fp = (REStackFrame *)newFP;
5200                fStack->setSize(newStackSize);
5201            }
5202            break;
5203
5204        case URX_BACKREF:
5205            {
5206                U_ASSERT(opValue < fFrameSize);
5207                int64_t groupStartIdx = fp->fExtra[opValue];
5208                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5209                U_ASSERT(groupStartIdx <= groupEndIdx);
5210                int64_t inputIndex = fp->fInputIdx;
5211                if (groupStartIdx < 0) {
5212                    // This capture group has not participated in the match thus far,
5213                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5214                    break;
5215                }
5216                UBool success = TRUE;
5217                for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5218                    if (inputIndex >= fActiveLimit) {
5219                        success = FALSE;
5220                        fHitEnd = TRUE;
5221                        break;
5222                    }
5223                    if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5224                        success = FALSE;
5225                        break;
5226                    }
5227                }
5228                if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5229                        inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5230                    // Capture group ended with an unpaired lead surrogate.
5231                    // Back reference is not permitted to match lead only of a surrogatge pair.
5232                    success = FALSE;
5233                }
5234                if (success) {
5235                    fp->fInputIdx = inputIndex;
5236                } else {
5237                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5238                }
5239            }
5240            break;
5241
5242        case URX_BACKREF_I:
5243            {
5244                U_ASSERT(opValue < fFrameSize);
5245                int64_t groupStartIdx = fp->fExtra[opValue];
5246                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5247                U_ASSERT(groupStartIdx <= groupEndIdx);
5248                if (groupStartIdx < 0) {
5249                    // This capture group has not participated in the match thus far,
5250                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5251                    break;
5252                }
5253                CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5254                CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5255
5256                //   Note: if the capture group match was of an empty string the backref
5257                //         match succeeds.  Verified by testing:  Perl matches succeed
5258                //         in this case, so we do too.
5259
5260                UBool success = TRUE;
5261                for (;;) {
5262                    UChar32 captureGroupChar = captureGroupItr.next();
5263                    if (captureGroupChar == U_SENTINEL) {
5264                        success = TRUE;
5265                        break;
5266                    }
5267                    UChar32 inputChar = inputItr.next();
5268                    if (inputChar == U_SENTINEL) {
5269                        success = FALSE;
5270                        fHitEnd = TRUE;
5271                        break;
5272                    }
5273                    if (inputChar != captureGroupChar) {
5274                        success = FALSE;
5275                        break;
5276                    }
5277                }
5278
5279                if (success && inputItr.inExpansion()) {
5280                    // We otained a match by consuming part of a string obtained from
5281                    // case-folding a single code point of the input text.
5282                    // This does not count as an overall match.
5283                    success = FALSE;
5284                }
5285
5286                if (success) {
5287                    fp->fInputIdx = inputItr.getIndex();
5288                } else {
5289                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5290                }
5291            }
5292            break;
5293
5294        case URX_STO_INP_LOC:
5295            {
5296                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5297                fp->fExtra[opValue] = fp->fInputIdx;
5298            }
5299            break;
5300
5301        case URX_JMPX:
5302            {
5303                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5304                fp->fPatIdx += 1;
5305                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5306                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5307                int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5308                U_ASSERT(savedInputIdx <= fp->fInputIdx);
5309                if (savedInputIdx < fp->fInputIdx) {
5310                    fp->fPatIdx = opValue;                               // JMP
5311                } else {
5312                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5313                }
5314            }
5315            break;
5316
5317        case URX_LA_START:
5318            {
5319                // Entering a lookahead block.
5320                // Save Stack Ptr, Input Pos.
5321                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5322                fData[opValue]   = fStack->size();
5323                fData[opValue+1] = fp->fInputIdx;
5324                fActiveStart     = fLookStart;          // Set the match region change for
5325                fActiveLimit     = fLookLimit;          //   transparent bounds.
5326            }
5327            break;
5328
5329        case URX_LA_END:
5330            {
5331                // Leaving a look-ahead block.
5332                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5333                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5334                int32_t stackSize = fStack->size();
5335                int32_t newStackSize = (int32_t)fData[opValue];
5336                U_ASSERT(stackSize >= newStackSize);
5337                if (stackSize > newStackSize) {
5338                    // Copy the current top frame back to the new (cut back) top frame.
5339                    //   This makes the capture groups from within the look-ahead
5340                    //   expression available.
5341                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5342                    int32_t i;
5343                    for (i=0; i<fFrameSize; i++) {
5344                        newFP[i] = ((int64_t *)fp)[i];
5345                    }
5346                    fp = (REStackFrame *)newFP;
5347                    fStack->setSize(newStackSize);
5348                }
5349                fp->fInputIdx = fData[opValue+1];
5350
5351                // Restore the active region bounds in the input string; they may have
5352                //    been changed because of transparent bounds on a Region.
5353                fActiveStart = fRegionStart;
5354                fActiveLimit = fRegionLimit;
5355            }
5356            break;
5357
5358        case URX_ONECHAR_I:
5359            if (fp->fInputIdx < fActiveLimit) {
5360                UChar32 c;
5361                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5362                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5363                    break;
5364                }
5365            } else {
5366                fHitEnd = TRUE;
5367            }
5368            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5369            break;
5370
5371        case URX_STRING_I:
5372            // Case-insensitive test input against a literal string.
5373            // Strings require two slots in the compiled pattern, one for the
5374            //   offset to the string text, and one for the length.
5375            //   The compiled string has already been case folded.
5376            {
5377                const UChar *patternString = litText + opValue;
5378
5379                op      = (int32_t)pat[fp->fPatIdx];
5380                fp->fPatIdx++;
5381                opType  = URX_TYPE(op);
5382                opValue = URX_VAL(op);
5383                U_ASSERT(opType == URX_STRING_LEN);
5384                int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5385
5386                UChar32      cText;
5387                UChar32      cPattern;
5388                UBool        success = TRUE;
5389                int32_t      patternStringIdx  = 0;
5390                CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5391                while (patternStringIdx < patternStringLen) {
5392                    U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5393                    cText = inputIterator.next();
5394                    if (cText != cPattern) {
5395                        success = FALSE;
5396                        if (cText == U_SENTINEL) {
5397                            fHitEnd = TRUE;
5398                        }
5399                        break;
5400                    }
5401                }
5402                if (inputIterator.inExpansion()) {
5403                    success = FALSE;
5404                }
5405
5406                if (success) {
5407                    fp->fInputIdx = inputIterator.getIndex();
5408                } else {
5409                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5410                }
5411            }
5412            break;
5413
5414        case URX_LB_START:
5415            {
5416                // Entering a look-behind block.
5417                // Save Stack Ptr, Input Pos.
5418                //   TODO:  implement transparent bounds.  Ticket #6067
5419                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5420                fData[opValue]   = fStack->size();
5421                fData[opValue+1] = fp->fInputIdx;
5422                // Init the variable containing the start index for attempted matches.
5423                fData[opValue+2] = -1;
5424                // Save input string length, then reset to pin any matches to end at
5425                //   the current position.
5426                fData[opValue+3] = fActiveLimit;
5427                fActiveLimit     = fp->fInputIdx;
5428            }
5429            break;
5430
5431
5432        case URX_LB_CONT:
5433            {
5434                // Positive Look-Behind, at top of loop checking for matches of LB expression
5435                //    at all possible input starting positions.
5436
5437                // Fetch the min and max possible match lengths.  They are the operands
5438                //   of this op in the pattern.
5439                int32_t minML = (int32_t)pat[fp->fPatIdx++];
5440                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5441                U_ASSERT(minML <= maxML);
5442                U_ASSERT(minML >= 0);
5443
5444                // Fetch (from data) the last input index where a match was attempted.
5445                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5446                int64_t  &lbStartIdx = fData[opValue+2];
5447                if (lbStartIdx < 0) {
5448                    // First time through loop.
5449                    lbStartIdx = fp->fInputIdx - minML;
5450                    if (lbStartIdx > 0) {
5451                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5452                    }
5453                } else {
5454                    // 2nd through nth time through the loop.
5455                    // Back up start position for match by one.
5456                    if (lbStartIdx == 0) {
5457                        lbStartIdx--;
5458                    } else {
5459                        U16_BACK_1(inputBuf, 0, lbStartIdx);
5460                    }
5461                }
5462
5463                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5464                    // We have tried all potential match starting points without
5465                    //  getting a match.  Backtrack out, and out of the
5466                    //   Look Behind altogether.
5467                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5468                    int64_t restoreInputLen = fData[opValue+3];
5469                    U_ASSERT(restoreInputLen >= fActiveLimit);
5470                    U_ASSERT(restoreInputLen <= fInputLength);
5471                    fActiveLimit = restoreInputLen;
5472                    break;
5473                }
5474
5475                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5476                //      (successful match will fall off the end of the loop.)
5477                fp = StateSave(fp, fp->fPatIdx-3, status);
5478                fp->fInputIdx =  lbStartIdx;
5479            }
5480            break;
5481
5482        case URX_LB_END:
5483            // End of a look-behind block, after a successful match.
5484            {
5485                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5486                if (fp->fInputIdx != fActiveLimit) {
5487                    //  The look-behind expression matched, but the match did not
5488                    //    extend all the way to the point that we are looking behind from.
5489                    //  FAIL out of here, which will take us back to the LB_CONT, which
5490                    //     will retry the match starting at another position or fail
5491                    //     the look-behind altogether, whichever is appropriate.
5492                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5493                    break;
5494                }
5495
5496                // Look-behind match is good.  Restore the orignal input string length,
5497                //   which had been truncated to pin the end of the lookbehind match to the
5498                //   position being looked-behind.
5499                int64_t originalInputLen = fData[opValue+3];
5500                U_ASSERT(originalInputLen >= fActiveLimit);
5501                U_ASSERT(originalInputLen <= fInputLength);
5502                fActiveLimit = originalInputLen;
5503            }
5504            break;
5505
5506
5507        case URX_LBN_CONT:
5508            {
5509                // Negative Look-Behind, at top of loop checking for matches of LB expression
5510                //    at all possible input starting positions.
5511
5512                // Fetch the extra parameters of this op.
5513                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5514                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5515                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5516                continueLoc = URX_VAL(continueLoc);
5517                U_ASSERT(minML <= maxML);
5518                U_ASSERT(minML >= 0);
5519                U_ASSERT(continueLoc > fp->fPatIdx);
5520
5521                // Fetch (from data) the last input index where a match was attempted.
5522                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5523                int64_t  &lbStartIdx = fData[opValue+2];
5524                if (lbStartIdx < 0) {
5525                    // First time through loop.
5526                    lbStartIdx = fp->fInputIdx - minML;
5527                    if (lbStartIdx > 0) {
5528                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5529                    }
5530                } else {
5531                    // 2nd through nth time through the loop.
5532                    // Back up start position for match by one.
5533                    if (lbStartIdx == 0) {
5534                        lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
5535                    } else {
5536                        U16_BACK_1(inputBuf, 0, lbStartIdx);
5537                    }
5538                }
5539
5540                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5541                    // We have tried all potential match starting points without
5542                    //  getting a match, which means that the negative lookbehind as
5543                    //  a whole has succeeded.  Jump forward to the continue location
5544                    int64_t restoreInputLen = fData[opValue+3];
5545                    U_ASSERT(restoreInputLen >= fActiveLimit);
5546                    U_ASSERT(restoreInputLen <= fInputLength);
5547                    fActiveLimit = restoreInputLen;
5548                    fp->fPatIdx = continueLoc;
5549                    break;
5550                }
5551
5552                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5553                //      (successful match will cause a FAIL out of the loop altogether.)
5554                fp = StateSave(fp, fp->fPatIdx-4, status);
5555                fp->fInputIdx =  lbStartIdx;
5556            }
5557            break;
5558
5559        case URX_LBN_END:
5560            // End of a negative look-behind block, after a successful match.
5561            {
5562                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5563                if (fp->fInputIdx != fActiveLimit) {
5564                    //  The look-behind expression matched, but the match did not
5565                    //    extend all the way to the point that we are looking behind from.
5566                    //  FAIL out of here, which will take us back to the LB_CONT, which
5567                    //     will retry the match starting at another position or succeed
5568                    //     the look-behind altogether, whichever is appropriate.
5569                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5570                    break;
5571                }
5572
5573                // Look-behind expression matched, which means look-behind test as
5574                //   a whole Fails
5575
5576                //   Restore the orignal input string length, which had been truncated
5577                //   inorder to pin the end of the lookbehind match
5578                //   to the position being looked-behind.
5579                int64_t originalInputLen = fData[opValue+3];
5580                U_ASSERT(originalInputLen >= fActiveLimit);
5581                U_ASSERT(originalInputLen <= fInputLength);
5582                fActiveLimit = originalInputLen;
5583
5584                // Restore original stack position, discarding any state saved
5585                //   by the successful pattern match.
5586                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5587                int32_t newStackSize = (int32_t)fData[opValue];
5588                U_ASSERT(fStack->size() > newStackSize);
5589                fStack->setSize(newStackSize);
5590
5591                //  FAIL, which will take control back to someplace
5592                //  prior to entering the look-behind test.
5593                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5594            }
5595            break;
5596
5597
5598        case URX_LOOP_SR_I:
5599            // Loop Initialization for the optimized implementation of
5600            //     [some character set]*
5601            //   This op scans through all matching input.
5602            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5603            {
5604                U_ASSERT(opValue > 0 && opValue < sets->size());
5605                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5606                UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5607
5608                // Loop through input, until either the input is exhausted or
5609                //   we reach a character that is not a member of the set.
5610                int32_t ix = (int32_t)fp->fInputIdx;
5611                for (;;) {
5612                    if (ix >= fActiveLimit) {
5613                        fHitEnd = TRUE;
5614                        break;
5615                    }
5616                    UChar32   c;
5617                    U16_NEXT(inputBuf, ix, fActiveLimit, c);
5618                    if (c<256) {
5619                        if (s8->contains(c) == FALSE) {
5620                            U16_BACK_1(inputBuf, 0, ix);
5621                            break;
5622                        }
5623                    } else {
5624                        if (s->contains(c) == FALSE) {
5625                            U16_BACK_1(inputBuf, 0, ix);
5626                            break;
5627                        }
5628                    }
5629                }
5630
5631                // If there were no matching characters, skip over the loop altogether.
5632                //   The loop doesn't run at all, a * op always succeeds.
5633                if (ix == fp->fInputIdx) {
5634                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5635                    break;
5636                }
5637
5638                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5639                //   must follow.  It's operand is the stack location
5640                //   that holds the starting input index for the match of this [set]*
5641                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5642                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5643                int32_t stackLoc = URX_VAL(loopcOp);
5644                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5645                fp->fExtra[stackLoc] = fp->fInputIdx;
5646                fp->fInputIdx = ix;
5647
5648                // Save State to the URX_LOOP_C op that follows this one,
5649                //   so that match failures in the following code will return to there.
5650                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5651                fp = StateSave(fp, fp->fPatIdx, status);
5652                fp->fPatIdx++;
5653            }
5654            break;
5655
5656
5657        case URX_LOOP_DOT_I:
5658            // Loop Initialization for the optimized implementation of .*
5659            //   This op scans through all remaining input.
5660            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5661            {
5662                // Loop through input until the input is exhausted (we reach an end-of-line)
5663                // In DOTALL mode, we can just go straight to the end of the input.
5664                int32_t ix;
5665                if ((opValue & 1) == 1) {
5666                    // Dot-matches-All mode.  Jump straight to the end of the string.
5667                    ix = (int32_t)fActiveLimit;
5668                    fHitEnd = TRUE;
5669                } else {
5670                    // NOT DOT ALL mode.  Line endings do not match '.'
5671                    // Scan forward until a line ending or end of input.
5672                    ix = (int32_t)fp->fInputIdx;
5673                    for (;;) {
5674                        if (ix >= fActiveLimit) {
5675                            fHitEnd = TRUE;
5676                            break;
5677                        }
5678                        UChar32   c;
5679                        U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5680                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5681                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5682                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5683                                   isLineTerminator(c))) {
5684                                //  char is a line ending.  Put the input pos back to the
5685                                //    line ending char, and exit the scanning loop.
5686                                U16_BACK_1(inputBuf, 0, ix);
5687                                break;
5688                            }
5689                        }
5690                    }
5691                }
5692
5693                // If there were no matching characters, skip over the loop altogether.
5694                //   The loop doesn't run at all, a * op always succeeds.
5695                if (ix == fp->fInputIdx) {
5696                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5697                    break;
5698                }
5699
5700                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5701                //   must follow.  It's operand is the stack location
5702                //   that holds the starting input index for the match of this .*
5703                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5704                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5705                int32_t stackLoc = URX_VAL(loopcOp);
5706                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5707                fp->fExtra[stackLoc] = fp->fInputIdx;
5708                fp->fInputIdx = ix;
5709
5710                // Save State to the URX_LOOP_C op that follows this one,
5711                //   so that match failures in the following code will return to there.
5712                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5713                fp = StateSave(fp, fp->fPatIdx, status);
5714                fp->fPatIdx++;
5715            }
5716            break;
5717
5718
5719        case URX_LOOP_C:
5720            {
5721                U_ASSERT(opValue>=0 && opValue<fFrameSize);
5722                backSearchIndex = (int32_t)fp->fExtra[opValue];
5723                U_ASSERT(backSearchIndex <= fp->fInputIdx);
5724                if (backSearchIndex == fp->fInputIdx) {
5725                    // We've backed up the input idx to the point that the loop started.
5726                    // The loop is done.  Leave here without saving state.
5727                    //  Subsequent failures won't come back here.
5728                    break;
5729                }
5730                // Set up for the next iteration of the loop, with input index
5731                //   backed up by one from the last time through,
5732                //   and a state save to this instruction in case the following code fails again.
5733                //   (We're going backwards because this loop emulates stack unwinding, not
5734                //    the initial scan forward.)
5735                U_ASSERT(fp->fInputIdx > 0);
5736                UChar32 prevC;
5737                U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5738
5739                if (prevC == 0x0a &&
5740                    fp->fInputIdx > backSearchIndex &&
5741                    inputBuf[fp->fInputIdx-1] == 0x0d) {
5742                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5743                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5744                        // .*, stepping back over CRLF pair.
5745                        U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5746                    }
5747                }
5748
5749
5750                fp = StateSave(fp, fp->fPatIdx-1, status);
5751            }
5752            break;
5753
5754
5755
5756        default:
5757            // Trouble.  The compiled pattern contains an entry with an
5758            //           unrecognized type tag.
5759            U_ASSERT(FALSE);
5760        }
5761
5762        if (U_FAILURE(status)) {
5763            isMatch = FALSE;
5764            break;
5765        }
5766    }
5767
5768breakFromLoop:
5769    fMatch = isMatch;
5770    if (isMatch) {
5771        fLastMatchEnd = fMatchEnd;
5772        fMatchStart   = startIdx;
5773        fMatchEnd     = fp->fInputIdx;
5774    }
5775
5776#ifdef REGEX_RUN_DEBUG
5777    if (fTraceDebug) {
5778        if (isMatch) {
5779            printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
5780        } else {
5781            printf("No match\n\n");
5782        }
5783    }
5784#endif
5785
5786    fFrame = fp;                // The active stack frame when the engine stopped.
5787                                //   Contains the capture group results that we need to
5788                                //    access later.
5789
5790    return;
5791}
5792
5793
5794UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5795
5796U_NAMESPACE_END
5797
5798#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5799